From 89cbf1de5583dd98530f5d3d20786238cd7267d4 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 1 Jun 2023 11:42:15 -0700 Subject: [PATCH 001/123] peft eval directly from ckpt (#6785) * update to load from ckpt Signed-off-by: arendu * update Signed-off-by: arendu * load ckpt peft model Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update style Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_peft_eval_config.yaml | 2 ++ .../tuning/megatron_gpt_peft_eval.py | 32 +++++++++++++++---- nemo/collections/nlp/parts/nlp_overrides.py | 12 +++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml index d7ebd69f31be..c430bd7fab5f 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml @@ -70,6 +70,8 @@ model: peft: peft_scheme: "adapter" # can be either adapter,ia3, or ptuning restore_from_path: null + restore_from_ckpt_name: null + restore_from_hparams_path: null # Used for adapter peft training adapter_tuning: diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py index b45f5da69e89..a5bf1ee552cb 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -104,9 +104,17 @@ def main(cfg) -> None: trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) if cfg.model.peft.restore_from_path: - peft_model_cfg = MegatronGPTPEFTModel.restore_from( - restore_path=cfg.model.peft.restore_from_path, 
trainer=trainer, return_config=True, - ) + if cfg.model.peft.restore_from_path.endswith(".nemo"): + peft_model_cfg = MegatronGPTPEFTModel.restore_from( + restore_path=cfg.model.peft.restore_from_path, trainer=trainer, return_config=True, + ) + elif cfg.model.peft.restore_from_hparams_path: # not a .nemo model we expect a hparams.yaml file + peft_model_cfg = OmegaConf.to_container(OmegaConf.load(cfg.model.peft.restore_from_hparams_path).cfg) + peft_model_cfg = OmegaConf.create(peft_model_cfg) + # extract dict inside cfg key and convert it to DictConfig + # this allows interpolation to work the same way as config from the .restore_from method + else: + raise RuntimeError("This script requires a .nemo peft model or path to hparams.yaml (and a ckpt path).") else: peft_model_cfg = MegatronGPTSFTModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True, @@ -127,9 +135,21 @@ def main(cfg) -> None: cfg.inference.tokens_to_generate = peft_model_cfg.data.test_ds.tokens_to_generate if cfg.model.peft.restore_from_path: - save_restore_connector = PEFTSaveRestoreConnector( - peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=None, - ) + if cfg.model.peft.restore_from_path.endswith(".nemo"): + save_restore_connector = PEFTSaveRestoreConnector( + peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=None, + ) + else: + # attempting to load a ckpt peft model. 
+ if cfg.model.peft.restore_from_ckpt_name: + ckpt_name = cfg.model.peft.restore_from_ckpt_name + else: + ckpt_name = "model_weights.ckpt" + save_restore_connector = PEFTSaveRestoreConnector( + peft_model_nemo_path=None, + peft_model_ckpt_path=cfg.model.peft.restore_from_path, + peft_model_ckpt_name=ckpt_name, + ) else: save_restore_connector = NLPSaveRestoreConnector() diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index a43e06669489..5a0f028ddbe9 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -404,14 +404,20 @@ class PEFTSaveRestoreConnector(NLPSaveRestoreConnector): Args: peft_model_nemo_path: Used to provide the .nemo file corresponding to a PEFT model (which will only contain a small set of params) - peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFt model. This is required when no .nemo is available (yet) such as during resumed training. + peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFT model. This is required when no .nemo is available (yet) such as during resumed training. + peft_model_ckpt_name: The filename of the ckpt file inside the peft_model_ckpt_path folder If both are provided the peft_model_ckpt_path takes precedence. If neither are provided, PEFT params are initialized at random (not loaded from any external source). 
""" - def __init__(self, peft_model_nemo_path: Optional[str] = None, peft_model_ckpt_path: Optional[str] = None) -> None: + def __init__( + self, + peft_model_nemo_path: Optional[str] = None, + peft_model_ckpt_path: Optional[str] = None, + peft_model_ckpt_name: Optional[str] = "model_weights.ckpt", + ) -> None: super().__init__() - self.peft_model_ckpt_name = "model_weights.ckpt" + self.peft_model_ckpt_name = peft_model_ckpt_name if peft_model_ckpt_path: # First we will try to load a adapter ckpt path # this is given priority over loading from nemo path to make resumption of training possible From 23f1c429a9a7a48bf77fc3d689c401fc5ccb34b7 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 1 Jun 2023 14:55:57 -0400 Subject: [PATCH 002/123] Add Frame-VAD examples and utils (#6463) * add model, dataset, necessary utils and tests Signed-off-by: stevehuang52 * fix tarred data Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * add fvad examples and update utils Signed-off-by: stevehuang52 * add copyright Signed-off-by: stevehuang52 * refactor and add tests Signed-off-by: stevehuang52 * update dataset Signed-off-by: stevehuang52 * update test Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * fix typos Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: Taejin Park --- .../conf/marblenet/marblenet_3x2x64_20ms.yaml | 209 +++++++++ .../conf/vad/frame_vad_infer_postprocess.yaml | 38 ++ .../speech_classification/frame_vad_infer.py | 188 ++++++++ .../speech_to_frame_label.py | 61 +++ nemo/collections/asr/data/audio_to_label.py | 22 +- .../asr/data/audio_to_label_dataset.py | 6 +- .../asr/models/classification_models.py | 10 +- .../asr/parts/preprocessing/features.py | 4 +- .../asr/parts/preprocessing/segment.py | 30 +- 
nemo/collections/asr/parts/utils/vad_utils.py | 439 +++++++++++++++--- .../asr/test_asr_classification_model.py | 47 +- tests/collections/asr/utils/test_vad_utils.py | 126 +++++ 12 files changed, 1026 insertions(+), 154 deletions(-) create mode 100644 examples/asr/conf/marblenet/marblenet_3x2x64_20ms.yaml create mode 100644 examples/asr/conf/vad/frame_vad_infer_postprocess.yaml create mode 100644 examples/asr/speech_classification/frame_vad_infer.py create mode 100644 examples/asr/speech_classification/speech_to_frame_label.py create mode 100644 tests/collections/asr/utils/test_vad_utils.py diff --git a/examples/asr/conf/marblenet/marblenet_3x2x64_20ms.yaml b/examples/asr/conf/marblenet/marblenet_3x2x64_20ms.yaml new file mode 100644 index 000000000000..2c98c210eb0e --- /dev/null +++ b/examples/asr/conf/marblenet/marblenet_3x2x64_20ms.yaml @@ -0,0 +1,209 @@ +name: &name "MarbleNet-3x2x64" + +model: + sample_rate: 16000 + repeat: 2 + dropout: 0.0 + kernel_size_factor: 1.0 + + labels: ['0', '1'] + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + labels: ${model.labels} + batch_size: 128 + shuffle: True + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + tarred_shard_strategy: "scatter" + shuffle_n: 2048 + num_workers: 8 + pin_memory: true + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + bucketing_weights: null + augmentor: + white_noise: + prob: 0.9 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: /manifests/vad_noise/freesound_nonspeech_train_FL200.json + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + labels: ${model.labels} + batch_size: 128 + shuffle: False + num_workers: 8 + pin_memory: true + val_loss_idx: 0 + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + labels: ${model.labels} + batch_size: 128 + shuffle: False + num_workers: 8 + pin_memory: true + test_loss_idx: 0 + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + normalize: "None" + window_size: 0.025 + sample_rate: ${model.sample_rate} + window_stride: 0.01 + window: "hann" + features: &n_mels 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + stft_conv: false + pad_to: 2 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConvASREncoder + feat_in: *n_mels + activation: relu + conv_mask: true + + jasper: + - filters: 128 + repeat: 1 + kernel: [11] + stride: [2] + dilation: [1] + dropout: ${model.dropout} + residual: false + separable: true + kernel_size_factor: ${model.kernel_size_factor} + + - filters: 64 + repeat: ${model.repeat} + kernel: [13] + stride: [1] + dilation: [1] + dropout: ${model.dropout} + residual: true + separable: true + kernel_size_factor: ${model.kernel_size_factor} + + - filters: 64 + repeat: ${model.repeat} + kernel: [15] + stride: [1] + dilation: [1] + dropout: ${model.dropout} + residual: true + separable: true + kernel_size_factor: ${model.kernel_size_factor} + + - filters: 64 + repeat: ${model.repeat} + kernel: [17] + stride: [1] + dilation: [1] + dropout: ${model.dropout} + residual: true + separable: true + kernel_size_factor: ${model.kernel_size_factor} + + - filters: 128 + repeat: 1 + kernel: [29] + stride: [1] + dilation: [2] + dropout: ${model.dropout} + residual: false + separable: true + kernel_size_factor: ${model.kernel_size_factor} + + - filters: 
&enc_filters 128 + repeat: 1 + kernel: [1] + stride: [1] + dilation: [1] + dropout: ${model.dropout} + residual: false + + decoder: + _target_: nemo.collections.common.parts.MultiLayerPerceptron + hidden_size: *enc_filters + num_classes: -1 + num_layers: 1 + activation: 'relu' + log_softmax: false + + optim: + name: sgd + lr: 0.01 + # optimizer arguments + weight_decay: 0.001 + momentum: 0.9 + + # scheduler setup + sched: + name: PolynomialHoldDecayAnnealing + # Scheduler params + power: 2.0 + warmup_ratio: 0.05 + hold_ratio: 0.45 + min_lr: 0.001 + last_epoch: -1 + +trainer: + devices: -1 # number of gpus, -1 to use all gpus + max_epochs: 100 + max_steps: -1 # computed at runtime if not set + num_nodes: 1 + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: False # Provided by exp_manager + log_every_n_steps: 10 # Interval of logging. + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + check_val_every_n_epoch: 1 + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + +exp_manager: + exp_dir: null + name: *name + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: "val_acc_macro" + mode: "max" + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + save_best_model: true + + # you need to set these two to True to continue the training + resume_if_exists: true + resume_ignore_no_checkpoint: true + + create_wandb_logger: False + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml new file mode 100644 index 000000000000..8c9ef7fffaf5 --- /dev/null +++ b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml @@ -0,0 +1,38 @@ +name: &name 
"vad_inference_postprocessing" + +dataset: null # Path of json file of evaluation data. Audio files should have unique names +num_workers: 12 +sample_rate: 16000 +evaluate: False # whether to get AUROC and DERs, the manifest must contains groundtruth if enabled + +prepare_manifest: + auto_split: True # whether to automatically split manifest entry by split_duration to avoid potential CUDA out of memory issue. + split_duration: 400 # try smaller number if you still have CUDA memory issue + +vad: + model_path: "vad_multilingual_frame_marblenet" #.nemo local model path or pretrained model name or none + use_rttm: True # set True to output as RTTM format + parameters: # Parameters not tuned on large datasets, please use default parameters with caution + normalize_audio_db: null # set to non null value to normalize RMS DB of audio before preprocessing + window_length_in_sec: 0.0 # window length in sec for VAD context input, must be 0 for frame-VAD + shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD + smoothing: False # Deprecated for Frame-VAD. false or type of smoothing method (eg: median, mean) + overlap: 0.875 # Deprecated for Frame-VAD. overlap ratio for overlapped mean/median smoothing filter. If smoothing=False, ignore this value. + postprocessing: + onset: 0.3 # onset threshold for detecting the beginning and end of a speech + offset: 0.3 # offset threshold for detecting the end of a speech. 
+ pad_onset: 0.5 # adding durations before each speech segment + pad_offset: 0.5 # adding durations after each speech segment + min_duration_on: 0.0 # threshold for short speech deletion + min_duration_off: 0.6 # threshold for short non-speech segment deletion + filter_speech_first: True + +prepared_manifest_vad_input: null # if not specified, it will automatically be generated as "manifest_vad_input.json" +frame_out_dir: "vad_frame_outputs" +smoothing_out_dir: null # if not specified, it will automatically be generated as frame_out_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap) +rttm_out_dir: null # if not specified, it will automatically be frame_out_dir + "/seg_output_" + key and value in postprocessing params +out_manifest_filepath: null # if not specified, it will automatically be "manifest_vad_output.json" + + +# json manifest line example +# {"audio_filepath": "/path/to/audio_file.wav", "offset": 0, "duration": 1.23, "label": "infer", "text": "-"} diff --git a/examples/asr/speech_classification/frame_vad_infer.py b/examples/asr/speech_classification/frame_vad_infer.py new file mode 100644 index 000000000000..9c8e57b0773d --- /dev/null +++ b/examples/asr/speech_classification/frame_vad_infer.py @@ -0,0 +1,188 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script performs VAD on each 20ms frame of the input audio files. 
+Postprocessing is also performed to generate speech segments and store them as RTTM files. +Long audio files will be split into smaller chunks to avoid OOM issues, but the frames close +to the split points might have worse performance due to truncated context. + +## Usage: +python frame_vad_infer.py \ + --config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \ + dataset=<path to the json manifest of evaluation data> +""" + +import os +from pathlib import Path + +import torch + +from nemo.collections.asr.parts.utils.manifest_utils import write_manifest +from nemo.collections.asr.parts.utils.vad_utils import ( + frame_vad_eval_detection_error, + frame_vad_infer_load_manifest, + generate_overlap_vad_seq, + generate_vad_frame_pred, + generate_vad_segment_table, + init_frame_vad_model, + prepare_manifest, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@hydra_runner(config_path="../conf/vad", config_name="frame_vad_infer_postprocess") +def main(cfg): + if not cfg.dataset: + raise ValueError("You must input the path of json file of evaluation data") + + # each line of dataset should have a different audio_filepath and a unique name to simplify edge cases or conditions + logging.info(f"Loading manifest file {cfg.dataset}") + manifest_orig, key_labels_map, key_rttm_map = frame_vad_infer_load_manifest(cfg) + + # Prepare manifest for streaming VAD + manifest_vad_input = cfg.dataset + if cfg.prepare_manifest.auto_split: + logging.info("Split long audio file to avoid CUDA memory issue") + logging.debug("Try smaller split_duration if you still have CUDA memory issue") + config = { + 'input': manifest_vad_input, + 'window_length_in_sec': cfg.vad.parameters.window_length_in_sec, + 'split_duration': cfg.prepare_manifest.split_duration, + 'num_workers': cfg.num_workers, + 'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input, + } + manifest_vad_input = prepare_manifest(config) + else: + 
logging.warning( + "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it." + ) + + torch.set_grad_enabled(False) + vad_model = init_frame_vad_model(cfg.vad.model_path) + + # setup_test_data + vad_model.setup_test_data( + test_data_config={ + 'batch_size': 1, + 'sample_rate': 16000, + 'manifest_filepath': manifest_vad_input, + 'labels': ['infer'], + 'num_workers': cfg.num_workers, + 'shuffle': False, + 'normalize_audio_db': cfg.vad.parameters.normalize_audio_db, + } + ) + + vad_model = vad_model.to(device) + vad_model.eval() + + if not os.path.exists(cfg.frame_out_dir): + logging.info(f"Frame predictions do not exist at {cfg.frame_out_dir}, generating frame prediction.") + os.mkdir(cfg.frame_out_dir) + extract_frame_preds = True + else: + logging.info(f"Frame predictions already exist at {cfg.frame_out_dir}, skipping frame prediction generation.") + extract_frame_preds = False + + if extract_frame_preds: + logging.info("Generating frame-level prediction ") + pred_dir = generate_vad_frame_pred( + vad_model=vad_model, + window_length_in_sec=cfg.vad.parameters.window_length_in_sec, + shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec, + manifest_vad_input=manifest_vad_input, + out_dir=cfg.frame_out_dir, + ) + logging.info(f"Finish generating VAD frame level prediction. You can find the prediction in {pred_dir}") + else: + pred_dir = cfg.frame_out_dir + + frame_length_in_sec = cfg.vad.parameters.shift_length_in_sec + + # overlap smoothing filter + if cfg.vad.parameters.smoothing: + # Generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. 
+ # smoothing_method would be either in majority vote (median) or average (mean) + logging.info("Generating predictions with overlapping input segments") + smoothing_pred_dir = generate_overlap_vad_seq( + frame_pred_dir=pred_dir, + smoothing_method=cfg.vad.parameters.smoothing, + overlap=cfg.vad.parameters.overlap, + window_length_in_sec=cfg.vad.parameters.window_length_in_sec, + shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec, + num_workers=cfg.num_workers, + out_dir=cfg.smoothing_out_dir, + ) + logging.info( + f"Finish generating predictions with overlapping input segments with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}" + ) + pred_dir = smoothing_pred_dir + + # postprocessing and generate speech segments + logging.info("Converting frame level prediction to RTTM files.") + rttm_out_dir = generate_vad_segment_table( + vad_pred_dir=pred_dir, + postprocessing_params=cfg.vad.parameters.postprocessing, + frame_length_in_sec=frame_length_in_sec, + num_workers=cfg.num_workers, + use_rttm=cfg.vad.use_rttm, + out_dir=cfg.rttm_out_dir, + ) + logging.info( + f"Finish generating speech semgents table with postprocessing_params: {cfg.vad.parameters.postprocessing}" + ) + + logging.info("Writing VAD output to manifest") + key_pred_rttm_map = {} + manifest_new = [] + for entry in manifest_orig: + key = Path(entry['audio_filepath']).stem + entry['rttm_filepath'] = Path(os.path.join(rttm_out_dir, key + ".rttm")).absolute().as_posix() + if not Path(entry['rttm_filepath']).is_file(): + logging.warning(f"Not able to find {entry['rttm_filepath']} for {entry['audio_filepath']}") + entry['rttm_filepath'] = "" + manifest_new.append(entry) + key_pred_rttm_map[key] = entry['rttm_filepath'] + + if not cfg.out_manifest_filepath: + out_manifest_filepath = "manifest_vad_output.json" + else: + out_manifest_filepath = cfg.out_manifest_filepath + write_manifest(out_manifest_filepath, manifest_new) + logging.info(f"Finished writing VAD 
output to manifest: {out_manifest_filepath}") + + if cfg.get("evaluate", False): + logging.info("Evaluating VAD results") + auroc, report = frame_vad_eval_detection_error( + pred_dir=pred_dir, + key_labels_map=key_labels_map, + key_rttm_map=key_rttm_map, + key_pred_rttm_map=key_pred_rttm_map, + frame_length_in_sec=frame_length_in_sec, + ) + DetER = report.iloc[[-1]][('detection error rate', '%')].item() + FA = report.iloc[[-1]][('false alarm', '%')].item() + MISS = report.iloc[[-1]][('miss', '%')].item() + logging.info(f"AUROC: {auroc:.4f}") + logging.info(f"DetER={DetER:0.4f}, False Alarm={FA:0.4f}, Miss={MISS:0.4f}") + logging.info(f"with params: {cfg.vad.parameters.postprocessing}") + logging.info("Done!") + + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter diff --git a/examples/asr/speech_classification/speech_to_frame_label.py b/examples/asr/speech_classification/speech_to_frame_label.py new file mode 100644 index 000000000000..04cc77afda44 --- /dev/null +++ b/examples/asr/speech_classification/speech_to_frame_label.py @@ -0,0 +1,61 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The script trains a model that performs classification on each frame of the input audio. +The default config (i.e., marblenet_3x2x64_20ms.yaml) outputs 20ms frames. 
+ +## Training +```sh +python speech_to_frame_label.py \ + --config-path="../conf/marblenet" \ + --config-name="marblenet_3x2x64_20ms" \ + model.train_ds.manifest_filepath="<path to train manifest>" \ + model.validation_ds.manifest_filepath=["<path to val manifest 1>","<path to val manifest 2>"] \ + trainer.devices=2 \ + trainer.accelerator="gpu" \ + trainer.strategy="ddp" \ + trainer.max_epochs=200 +``` +""" + +import pytorch_lightning as pl +from omegaconf import OmegaConf +from nemo.collections.asr.models.classification_models import EncDecFrameClassificationModel + +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="../conf/marblenet", config_name="marblenet_3x2x64_20ms") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**cfg.trainer) + exp_manager(trainer, cfg.get("exp_manager", None)) + model = EncDecFrameClassificationModel(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + model.maybe_init_from_pretrained_checkpoint(cfg) + + trainer.fit(model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if model.prepare_test(trainer): + trainer.test(model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/data/audio_to_label.py b/nemo/collections/asr/data/audio_to_label.py index 7585e4d7ea4f..f00f961b4c81 100644 --- a/nemo/collections/asr/data/audio_to_label.py +++ b/nemo/collections/asr/data/audio_to_label.py @@ -505,7 +505,7 @@ class _TarredAudioLabelDataset(IterableDataset): - `replicate`: Optional shard strategy, where each node gets all of the set of shards available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. 
+ dataset independently of other nodes, and reduces dependence on the value of `shuffle_n`. .. warning:: Replicated strategy allows every node to sample the entire set of available tarfiles, @@ -894,9 +894,8 @@ class AudioToMultiLabelDataset(Dataset): Defaults to False. cal_labels_occurrence (bool): Whether to calculate occurrence of labels Defaults to False. - delimiter (Optional[str]): Delimiter to use when spliting the label string, default to None. - normalize_audio_db (bool): Whether to normalize audio signal to a target db, default to False. - normalize_audio_db_target (float): Target db to normalize audio signal, default to -20. + delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None. + normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None. """ @property @@ -942,8 +941,7 @@ def __init__( is_regression_task: bool = False, cal_labels_occurrence: Optional[bool] = False, delimiter: Optional[str] = None, - normalize_audio_db: bool = False, - normalize_audio_db_target: float = -20.0, + normalize_audio_db: Optional[float] = None, ): super().__init__() if isinstance(manifest_filepath, str): @@ -951,7 +949,6 @@ def __init__( self.delimiter = delimiter self.normalize_audio_db = normalize_audio_db - self.normalize_audio_db_target = normalize_audio_db_target self.collection = collections.ASRSpeechLabel( manifests_files=manifest_filepath, @@ -1022,7 +1019,6 @@ def __getitem__(self, index): duration=sample.duration, trim=self.trim, normalize_db=self.normalize_audio_db, - normalize_db_target=self.normalize_audio_db_target, ) f, fl = features, torch.tensor(features.size(0)).long() @@ -1104,9 +1100,8 @@ class TarredAudioToMultiLabelDataset(IterableDataset): or test datasets. global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
- delimiter (Optional[str]): Delimiter to use when spliting the label string, default to None. - normalize_audio_db (bool): Whether to normalize audio signal to a target db, default to False. - normalize_audio_db_target (float): Target db to normalize audio signal, default to -20. + delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None. + normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None. """ def __init__( @@ -1127,8 +1122,7 @@ def __init__( global_rank: int = 0, world_size: int = 0, delimiter: Optional[str] = None, - normalize_audio_db: bool = False, - normalize_audio_db_target: float = -20.0, + normalize_audio_db: Optional[float] = None, ): super().__init__() if isinstance(manifest_filepath, str): @@ -1138,7 +1132,6 @@ def __init__( self.is_regression_task = is_regression_task self.delimiter = delimiter self.normalize_audio_db = normalize_audio_db - self.normalize_audio_db_target = normalize_audio_db_target self.collection = collections.ASRSpeechLabel( manifests_files=manifest_filepath, @@ -1278,7 +1271,6 @@ def _build_sample(self, tup): duration=manifest_entry.duration, trim=self.trim, normalize_db=self.normalize_audio_db, - normalize_db_target=self.normalize_audio_db_target, ) audio_filestream.close() diff --git a/nemo/collections/asr/data/audio_to_label_dataset.py b/nemo/collections/asr/data/audio_to_label_dataset.py index a242308d4042..dcead6df94b8 100644 --- a/nemo/collections/asr/data/audio_to_label_dataset.py +++ b/nemo/collections/asr/data/audio_to_label_dataset.py @@ -240,8 +240,7 @@ def get_audio_multi_label_dataset(cfg: DictConfig) -> audio_to_label.AudioToMult is_regression_task=cfg.get("is_regression_task", False), cal_labels_occurrence=cfg.get("cal_labels_occurrence", False), delimiter=cfg.get("delimiter", None), - normalize_audio_db=cfg.get("normalize_audio_db", False), - normalize_audio_db_target=cfg.get("normalize_audio_db_target", -20), + 
normalize_audio_db=cfg.get("normalize_audio_db", None), ) return dataset @@ -294,8 +293,7 @@ def get_tarred_audio_multi_label_dataset( shard_strategy=cfg.get('tarred_shard_strategy', 'scatter'), global_rank=global_rank, world_size=world_size, - normalize_audio_db=cfg.get("normalize_audio_db", False), - normalize_audio_db_target=cfg.get("normalize_audio_db_target", -20), + normalize_audio_db=cfg.get("normalize_audio_db", None), ) if bucketing_weights: diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index a7b55e49d754..fb0ee82132a1 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -845,6 +845,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.num_classes = len(cfg.labels) self.eval_loop_cnt = 0 + self.ratio_threshold = cfg.get('ratio_threshold', 0.2) super().__init__(cfg=cfg, trainer=trainer) @classmethod @@ -1063,8 +1064,9 @@ def reshape_labels(self, logits, labels, logits_len, labels_len): Reshape labels to match logits shape. For example, each label is expected to cover a 40ms frame, while each frme prediction from the model covers 20ms. If labels are shorter than logits, labels are repeated, otherwise labels are folded and argmax is applied to obtain the label of each frame. When lengths of labels and logits are not factors of each other, labels are truncated or padded with zeros. - The threshold 0.2 is used to determine whether to pad or truncate labels, where the value 0.2 is not important as in real cases the ratio - is very close to either ceil(ratio) or floor(ratio). We use 0.2 here for easier unit-testing. + The ratio_threshold=0.2 is used to determine whether to pad or truncate labels, where the value 0.2 is not important as in real cases the ratio + is very close to either ceil(ratio) or floor(ratio). 
We use 0.2 here for easier unit-testing. This implementation does not allow frame length + and label length that are not multiples of each other. Args: logits: logits tensor with shape [B, T1, C] labels: labels tensor with shape [B, T2] @@ -1080,7 +1082,7 @@ def reshape_labels(self, logits, labels, logits_len, labels_len): if logits_max_len < labels_max_len: ratio = labels_max_len // logits_max_len res = labels_max_len % logits_max_len - if ceil(ratio) - ratio < 0.2: # e.g., ratio is 1.99 + if ceil(ratio) - ratio < self.ratio_threshold: # e.g., ratio is 1.99 # pad labels with zeros until labels_max_len is a multiple of logits_max_len labels = labels.cpu().tolist() if len(labels) % ceil(ratio) != 0: @@ -1101,7 +1103,7 @@ def reshape_labels(self, logits, labels, logits_len, labels_len): elif logits_max_len > labels_max_len: ratio = logits_max_len / labels_max_len res = logits_max_len % labels_max_len - if ceil(ratio) - ratio < 0.2: # e.g., ratio is 1.99 + if ceil(ratio) - ratio < self.ratio_threshold: # e.g., ratio is 1.99 # repeat labels for ceil(ratio) times, and DROP additional labels based on logits_max_len labels = labels.repeat_interleave(ceil(ratio), dim=1).long() labels = labels[:, :logits_max_len] diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index c2e84b04e981..531cd3105c04 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -181,8 +181,7 @@ def process( trim_hop_length=512, orig_sr=None, channel_selector=None, - normalize_db=False, - normalize_db_target=-20.0, + normalize_db=None, ): audio = AudioSegment.from_file( file_path, @@ -198,7 +197,6 @@ def process( orig_sr=orig_sr, channel_selector=channel_selector, normalize_db=normalize_db, - normalize_db_target=normalize_db_target, ) return self.process_segment(audio) diff --git a/nemo/collections/asr/parts/preprocessing/segment.py 
b/nemo/collections/asr/parts/preprocessing/segment.py index af6034f9af3a..89458ff4c4f6 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -36,6 +36,7 @@ import math import os import random +from typing import Optional import librosa import numpy as np @@ -78,8 +79,8 @@ def __init__( trim_hop_length=512, orig_sr=None, channel_selector=None, - normalize_db=False, - normalize_db_target=-20.0, + normalize_db: Optional[float] = None, + ref_channel: Optional[int] = None, ): """Create audio segment from samples. Samples are convert float32 internally, with int scaled to [-1, 1]. @@ -114,8 +115,11 @@ def __init__( self._samples = samples self._sample_rate = sample_rate self._orig_sr = orig_sr if orig_sr is not None else sample_rate - if normalize_db: - self.normalize_db(normalize_db_target) + self._ref_channel = ref_channel + self._normalize_db = normalize_db + + if normalize_db is not None: + self.normalize_db(normalize_db, ref_channel) def __eq__(self, other): """Return whether two objects are equal.""" @@ -185,8 +189,8 @@ def from_file( trim_hop_length=512, orig_sr=None, channel_selector=None, - normalize_db=False, - normalize_db_target=-20.0, + normalize_db=None, + ref_channel=None, ): """ Load a file supported by librosa and return as an AudioSegment. @@ -207,8 +211,8 @@ def from_file( :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable of integers denoting a subset of channels. Channel selector is using zero-based indexing. If set to `None`, the original signal will be used. 
- :param normalize_db (bool): if true, normalize the audio signal to a target RMS value - :param normalize_db_target (float): the target RMS value in decibels + :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value + :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels :return: AudioSegment instance """ samples = None @@ -226,6 +230,8 @@ def from_file( trim_hop_length=trim_hop_length, orig_sr=orig_sr, channel_selector=channel_selector, + normalize_db=normalize_db, + ref_channel=ref_channel, ) if not isinstance(audio_file, str) or os.path.splitext(audio_file)[-1] in sf_supported_formats: @@ -283,7 +289,7 @@ def from_file( orig_sr=orig_sr, channel_selector=channel_selector, normalize_db=normalize_db, - normalize_db_target=normalize_db_target, + ref_channel=ref_channel, ) @classmethod @@ -464,10 +470,14 @@ def orig_sr(self): def gain_db(self, gain): self._samples *= 10.0 ** (gain / 20.0) - def normalize_db(self, target_db=-20): + def normalize_db(self, target_db=-20, ref_channel=None): """Normalize the signal to a target RMS value in decibels. + For multi-channel audio, the RMS value is determined by the reference channel (if not None), + otherwise it will be the maximum RMS across all channels. """ rms_db = self.rms_db + if self.num_channels > 1: + rms_db = max(rms_db) if ref_channel is None else rms_db[ref_channel] gain = target_db - rms_db self.gain_db(gain) diff --git a/nemo/collections/asr/parts/utils/vad_utils.py b/nemo/collections/asr/parts/utils/vad_utils.py index d35d5466a523..addf3cae29b7 100644 --- a/nemo/collections/asr/parts/utils/vad_utils.py +++ b/nemo/collections/asr/parts/utils/vad_utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import glob import json import math @@ -18,8 +19,9 @@ import os import shutil from itertools import repeat +from math import ceil, floor from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import IPython.display as ipd import librosa @@ -27,12 +29,15 @@ import numpy as np import pandas as pd import torch +from omegaconf import DictConfig from pyannote.core import Annotation, Segment from pyannote.metrics import detection +from sklearn.metrics import roc_auc_score from sklearn.model_selection import ParameterGrid from tqdm import tqdm -from nemo.collections.asr.models import EncDecClassificationModel +from nemo.collections.asr.models import EncDecClassificationModel, EncDecFrameClassificationModel +from nemo.collections.common.parts.preprocessing.manifest import get_full_path from nemo.utils import logging try: @@ -78,6 +83,7 @@ def prepare_manifest(config: dict) -> str: 'label': 'infer', 'split_duration': config['split_duration'], 'window_length_in_sec': config['window_length_in_sec'], + 'manifest_dir': Path(config['input']).parent if type(config['input']) == str else '', } if config.get('num_workers') is not None and config['num_workers'] > 1: @@ -138,6 +144,12 @@ def write_vad_infer_manifest(file: dict, args_func: dict) -> list: in_duration = file.get('duration', None) in_offset = file.get('offset', 0) + # if filepath is not found, try to find it in the dir of manifest + if not Path(filepath).is_file(): + new_filepath = Path(args_func['manifest_dir']) / filepath + if new_filepath.is_file(): + filepath = new_filepath.absolute().as_posix() + try: sr = 16000 x, _sr = librosa.load(filepath, sr=sr, offset=in_offset, duration=in_duration) @@ -692,7 +704,12 @@ def generate_vad_segment_table_per_file(pred_filepath: str, per_args: dict) -> s def generate_vad_segment_table( - vad_pred_dir: str, postprocessing_params: dict, frame_length_in_sec: float, num_workers: int, out_dir: str = None, + 
vad_pred_dir: str, + postprocessing_params: dict, + frame_length_in_sec: float, + num_workers: int, + out_dir: str = None, + use_rttm: bool = False, ) -> str: """ Convert frame level prediction to speech segment in start and end times format. @@ -706,27 +723,26 @@ def generate_vad_segment_table( out_dir (str): output dir of generated table/csv file. num_workers(float): number of process for multiprocessing Returns: - table_out_dir(str): directory of the generated table. + out_dir(str): directory of the generated table. """ suffixes = ("frame", "mean", "median") vad_pred_filepath_list = [os.path.join(vad_pred_dir, x) for x in os.listdir(vad_pred_dir) if x.endswith(suffixes)] - if out_dir: - table_out_dir = out_dir - else: - table_out_dir_name = "table_output_tmp_" + if not out_dir: + out_dir_name = "seg_output_" for key in postprocessing_params: - table_out_dir_name = table_out_dir_name + str(key) + str(postprocessing_params[key]) + "_" + out_dir_name = out_dir_name + str(key) + str(postprocessing_params[key]) + "-" - table_out_dir = os.path.join(vad_pred_dir, table_out_dir_name) + out_dir = os.path.join(vad_pred_dir, out_dir_name) - if not os.path.exists(table_out_dir): - os.mkdir(table_out_dir) + if not os.path.exists(out_dir): + os.mkdir(out_dir) per_args = { "frame_length_in_sec": frame_length_in_sec, - "out_dir": table_out_dir, + "out_dir": out_dir, + "use_rttm": use_rttm, } per_args = {**per_args, **postprocessing_params} num_workers = None @@ -741,12 +757,11 @@ def generate_vad_segment_table( leave=True, ) ) - else: for vad_pred_filepath in tqdm(vad_pred_filepath_list, desc='creating speech segments', leave=True): generate_vad_segment_table_per_file(vad_pred_filepath, per_args) - return table_out_dir + return out_dir def generate_vad_segment_table_per_file_star(args): @@ -955,33 +970,50 @@ def pred_rttm_map(vad_pred: str, groundtruth_RTTM: str, vad_pred_method: str = " def plot( path2audio_file: str, - path2_vad_pred: str, - path2ground_truth_label: str = 
None, + path2_vad_pred: Optional[str] = None, + path2groundtruth_rttm: Optional[str] = None, + groundtruth_labels: Optional[str] = None, + sample_rate: int = 16000, offset: float = 0, duration: float = None, threshold: float = None, per_args: dict = None, + unit_frame_len: float = 0.01, + label_repeat: int = 1, + xticks_step: int = 5, ) -> ipd.Audio: """ - Plot VAD outputs for demonstration in tutorial + Plot Audio and/or VAD output and/or groundtruth labels for visualization Args: path2audio_file (str): path to audio file. path2_vad_pred (str): path to vad prediction file, - path2ground_truth_label(str): path to groundtruth label file. + path2groundtruth_rttm(str): path to groundtruth RTTM file. + ground_truth_labels(str): a list of groundtruth label. + sample_rate (int): sample rate of audio file. + offset (float): offset in seconds. + duration (float): duration in seconds. threshold (float): threshold for prediction score (from 0 to 1). per_args(dict): a dict that stores the thresholds for postprocessing. + unit_frame_len (float): unit frame length in seconds for VAD predictions. + label_repeat (int): repeat the label for this number of times to match different frame lengths in preds and labels. + xticks_step (int): step size for xticks. 
""" plt.figure(figsize=[20, 2]) - UNIT_FRAME_LEN = 0.01 - audio, sample_rate = librosa.load(path=path2audio_file, sr=16000, mono=True, offset=offset, duration=duration) + audio, sample_rate = librosa.load( + path=path2audio_file, sr=sample_rate, mono=True, offset=offset, duration=duration + ) dur = librosa.get_duration(y=audio, sr=sample_rate) - time = np.arange(offset, offset + dur, UNIT_FRAME_LEN) - frame, _ = load_tensor_from_file(path2_vad_pred) - frame_snippet = frame[int(offset / UNIT_FRAME_LEN) : int((offset + dur) / UNIT_FRAME_LEN)] + time = np.arange(offset, offset + dur, unit_frame_len) + len_pred = int(dur / unit_frame_len) + 1 + + frame_snippet = None + if path2_vad_pred: + frame, _ = load_tensor_from_file(path2_vad_pred) + frame_snippet = frame[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] + len_pred = len(frame_snippet) - len_pred = len(frame_snippet) ax1 = plt.subplot() ax1.plot(np.arange(audio.size) / sample_rate, audio, 'gray') ax1.set_xlim([0, int(dur) + 1]) @@ -995,27 +1027,41 @@ def plot( if not threshold and not per_args: raise ValueError("One and only one of threshold and per_args must have been used!") - if threshold: + if threshold and frame_snippet is not None: pred_snippet = np.where(frame_snippet >= threshold, 1, 0) - if per_args: + elif per_args and frame_snippet is not None: _, per_args_float = prepare_gen_segment_table( frame, per_args ) # take whole frame here for calculating onset and offset speech_segments = generate_vad_segment_table_per_tensor(frame, per_args_float) pred = gen_pred_from_speech_segments(speech_segments, frame) - pred_snippet = pred[int(offset / UNIT_FRAME_LEN) : int((offset + dur) / UNIT_FRAME_LEN)] + pred_snippet = pred[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] + else: + pred_snippet = None + + if path2groundtruth_rttm and path2groundtruth_rttm.endswith('.rttm'): + label = extract_labels(path2groundtruth_rttm, time) + elif groundtruth_labels: + label = [float(x) 
for x in groundtruth_labels] + if label_repeat > 1: + label = np.repeat(label, label_repeat) + label = label[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] + else: + label = None - if path2ground_truth_label: - label = extract_labels(path2ground_truth_label, time) - ax2.plot(np.arange(len_pred) * UNIT_FRAME_LEN, label, 'r', label='label') + if label: + ax2.plot(np.arange(len_pred) * unit_frame_len, label, 'r', label='label') + if pred_snippet: + ax2.plot(np.arange(len_pred) * unit_frame_len, pred_snippet, 'b', label='pred') + if frame_snippet: + ax2.plot(np.arange(len_pred) * unit_frame_len, frame_snippet, 'g--', label='speech prob') - ax2.plot(np.arange(len_pred) * UNIT_FRAME_LEN, pred_snippet, 'b', label='pred') - ax2.plot(np.arange(len_pred) * UNIT_FRAME_LEN, frame_snippet, 'g--', label='speech prob') ax2.tick_params(axis='y', labelcolor='r') ax2.legend(loc='lower right', shadow=True) ax2.set_ylabel('Preds and Probas') ax2.set_ylim([-0.1, 1.1]) - return ipd.Audio(audio, rate=16000) + ax2.set_xticks(np.arange(0, int(dur) + 1, xticks_step)) + return ipd.Audio(audio, rate=sample_rate) def gen_pred_from_speech_segments( @@ -1038,11 +1084,11 @@ def gen_pred_from_speech_segments( def extract_labels(path2ground_truth_label: str, time: list) -> list: """ Extract ground-truth label for given time period. - path2ground_truth_label (str): path of groundtruth label file + path2ground_truth_label (str): path of groundtruth RTTM file time (list) : a list of array representing time period. 
""" - data = pd.read_csv(path2ground_truth_label, sep=" ", delimiter=None, header=None) + data = pd.read_csv(path2ground_truth_label, sep="\s+", delimiter=None, header=None) data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"}) labels = [] for pos in time: @@ -1086,9 +1132,14 @@ def generate_vad_frame_pred( else: log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) probs = torch.softmax(log_probs, dim=-1) + if len(probs.shape) == 3 and probs.shape[0] == 1: + # squeeze the batch dimension, since batch size is 1 for frame-VAD + probs = probs.squeeze(0) # [1,T,C] -> [T,C] pred = probs[:, 1] - if status[i] == 'start': + if window_length_in_sec == 0: + to_save = pred + elif status[i] == 'start': to_save = pred[:-trunc] elif status[i] == 'next': to_save = pred[trunc:-trunc_l] @@ -1097,6 +1148,7 @@ def generate_vad_frame_pred( else: to_save = pred + to_save = to_save.cpu().tolist() all_len += len(to_save) outpath = os.path.join(out_dir, data[i] + ".frame") with open(outpath, "a", encoding='utf-8') as fout: @@ -1125,6 +1177,21 @@ def init_vad_model(model_path: str): return vad_model +def init_frame_vad_model(model_path: str): + """ + Initiate VAD model with model path + """ + if model_path.endswith('.nemo'): + logging.info(f"Using local VAD model from {model_path}") + vad_model = EncDecFrameClassificationModel.restore_from(restore_path=model_path) + elif model_path.endswith('.ckpt'): + vad_model = EncDecFrameClassificationModel.load_from_checkpoint(checkpoint_path=model_path) + else: + logging.info(f"Using NGC cloud VAD model {model_path}") + vad_model = EncDecFrameClassificationModel.from_pretrained(model_name=model_path) + return vad_model + + def stitch_segmented_asr_output( segmented_output_manifest: str, speech_segments_tensor_dir: str = "speech_segments", @@ -1238,32 +1305,6 @@ def construct_manifest_eval( return aligned_vad_asr_output_manifest -def extract_audio_features(vad_model: EncDecClassificationModel, 
manifest_vad_input: str, out_dir: str) -> str: - """ - Extract audio features and write to out_dir - """ - - file_list = [] - with open(manifest_vad_input, 'r', encoding='utf-8') as fin: - for line in fin.readlines(): - file_list.append(Path(json.loads(line)['audio_filepath']).stem) - - logging.info(f"Extracting features on {len(file_list)} audio files/json lines!") - - for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): - test_batch = [x.to(vad_model.device) for x in test_batch] - with autocast(): - processed_signal, processed_signal_length = vad_model.preprocessor( - input_signal=test_batch[0], length=test_batch[1], - ) - processed_signal = processed_signal.squeeze(0)[:, :processed_signal_length] - processed_signal = processed_signal.cpu() - outpath = os.path.join(out_dir, file_list[i] + ".pt") - torch.save(processed_signal, outpath) - del test_batch - return out_dir - - def load_rttm_file(filepath: str) -> pd.DataFrame: """ Load rttm file and extract speech segments @@ -1321,7 +1362,7 @@ def load_speech_overlap_segments_from_rttm(rttm_file: str) -> Tuple[List[List[fl Returns: merged (List[List[float]]): merged speech intervals without overlaps - overlaps (List[List[float]]): intervals without overlap speech + overlaps (List[List[float]]): intervals with overlap speech """ speech_segments = list(load_rttm_file(rttm_file)['segment']) speech_segments = [list(x) for x in speech_segments] @@ -1367,7 +1408,9 @@ def get_nonspeech_segments( return nonspeech_segments -def get_frame_labels(segments: List[List[float]], frame_length: float, offset: float, duration: float) -> str: +def get_frame_labels( + segments: List[List[float]], frame_length: float, offset: float, duration: float, as_str: bool = True +) -> str: """ Generate frame-level binary labels for audio, '0' for non-speech and '1' for speech @@ -1379,30 +1422,39 @@ def get_frame_labels(segments: List[List[float]], frame_length: float, offset: f """ labels = [] 
n_frames = int(np.ceil(duration / frame_length)) - sid = 0 for i in range(n_frames): t = offset + i * frame_length while sid < len(segments) - 1 and segments[sid][1] < t: sid += 1 - if segments[sid][0] <= t <= segments[sid][1]: - labels.append('1') + if segments[sid][1] != 0 and segments[sid][0] <= t <= segments[sid][1]: + labels.append(1) else: - labels.append('0') - return ' '.join(labels) + labels.append(0) + if as_str: + return ' '.join([str(x) for x in labels]) + return [float(x) for x in labels] def plot_sample_from_rttm( - audio_file: str, rttm_file: str, max_duration: Optional[float] = None, save_path: str = "", show: bool = True + audio_file: str, + rttm_file: str, + max_duration: Optional[float] = None, + save_path: str = "", + show: bool = True, + offset: float = 0.0, + unit_frame_len: float = 0.01, ): + """ + Plot audio signal and frame-level labels from RTTM file + """ plt.figure(figsize=[20, 2]) - UNIT_FRAME_LEN = 0.01 - audio, sample_rate = librosa.load(path=audio_file, sr=16000, mono=True, offset=0, duration=max_duration) + audio, sample_rate = librosa.load(path=audio_file, sr=16000, mono=True, offset=offset, duration=max_duration) dur = librosa.get_duration(y=audio, sr=sample_rate) segments = load_speech_segments_from_rttm(rttm_file) - labels = get_frame_labels(segments, UNIT_FRAME_LEN, 0.0, dur) + labels = get_frame_labels(segments, unit_frame_len, offset, dur) labels = [float(x) for x in labels.split()] length = len(labels) @@ -1415,7 +1467,7 @@ def plot_sample_from_rttm( ax1.set_ylim([-1, 1]) ax2 = ax1.twinx() - ax2.plot(np.arange(length) * UNIT_FRAME_LEN, labels, 'r', label='label') + ax2.plot(np.arange(length) * unit_frame_len, labels, 'r', label='label') ax2.tick_params(axis='y', labelcolor='r') ax2.legend(loc='lower right', shadow=True) ax2.set_ylabel('Labels') @@ -1425,3 +1477,240 @@ def plot_sample_from_rttm( if save_path: plt.savefig(save_path) return ipd.Audio(audio, rate=16000) + + +def align_labels_to_frames(probs, labels, 
threshold=0.2): + """ + Aligns labels to frames when the frame length (e.g., 10ms) is different from the label length (e.g., 20ms). + The threshold 0.2 is not important, since the actual ratio will always be close to an integer unless using frame/label + lengths that are not multiples of each other (e.g., 15ms frame length and 20ms label length), which is not valid. + The value 0.2 here is just for easier unit testing. + Args: + probs (List[float]): list of probabilities + labels (List[int]): list of labels + threshold (float): threshold for rounding ratio to integer + Returns: + labels (List[int]): list of labels aligned to frames + """ + frames_len = len(probs) + labels_len = len(labels) + probs = torch.tensor(probs).float() + labels = torch.tensor(labels).long() + + if frames_len < labels_len: + # pad labels with zeros until labels_len is a multiple of frames_len + ratio = labels_len / frames_len + res = labels_len % frames_len + if ( + ceil(ratio) - ratio < threshold + ): # e.g., ratio = 2.9, ceil(ratio) = 3, then we pad labels to make it a multiple of 3 + # pad labels with zeros until labels_max_len is a multiple of logits_max_len + labels = labels.tolist() + if len(labels) % ceil(ratio) != 0: + labels += [0] * (ceil(ratio) - len(labels) % ceil(ratio)) + labels = torch.tensor(labels).long() + labels = labels.view(-1, ceil(ratio)).amax(1) + return align_labels_to_frames(probs.tolist(), labels.long().tolist()) + # otherwise, truncate additional labels until labels_max_len is a multiple of logits_max_len + if res > 0: + labels = labels[:-res] + labels = labels.view(-1, floor(ratio)).amax(1) + return labels.long().tolist() + elif frames_len > labels_len: + # repeat labels until labels_len is a multiple of frames_len + ratio = frames_len / labels_len + res = frames_len % labels_len + if ceil(ratio) - ratio < threshold: + # e.g., ratio is 1.83, ceil(ratio) = 2, then we repeat labels to make it a multiple of 2, and discard the redundant labels + labels = 
labels.repeat_interleave(ceil(ratio), dim=0).long().tolist() + labels = labels[:frames_len] + else: + # e.g., ratio is 2.02, floor(ratio) = 2, then we repeat labels to make it a multiple of 2 and add additional labels + labels = labels.repeat_interleave(floor(ratio), dim=0).long().tolist() + if res > 0: + labels += labels[-res:] + return labels + else: + return labels.long().tolist() + + +def read_rttm_as_pyannote_object(rttm_file: str, speaker_override: Optional[str] = None) -> Annotation: + """ + Read rttm file and construct a Pyannote object. + Args: + rttm_file(str) : path of rttm file. + speaker_override(str) : if not None, all speakers will be replaced by this value. + Returns: + annotation(pyannote.Annotation): annotation object + """ + annotation = Annotation() + data = pd.read_csv(rttm_file, sep="\s+", delimiter=None, header=None) + data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"}) + for index, row in data.iterrows(): + if speaker_override is not None: + annotation[Segment(row['start'], row['start'] + row['dur'])] = speaker_override + else: + annotation[Segment(row['start'], row['start'] + row['dur'])] = row['speaker'] + return annotation + + +def convert_labels_to_speech_segments(labels: List[float], frame_length_in_sec: float = 0.01): + """ + Convert a list of labels to a list of speech segments. 
+ Args: + labels (List[float]): list of labels + frame_length_in_sec (float): frame length in seconds + Returns: + segments (List[Tuple[float, float]]): list of speech segments + """ + segments = [] + start = -1 + for i, label in enumerate(labels): + if label == 1: + if start == -1: + start = i * frame_length_in_sec + else: + if start > -1: + segments.append([start, (i - 1) * frame_length_in_sec]) + start = -1 + if start != -1: + segments.append([start, (len(labels) - 1) * frame_length_in_sec]) + return segments + + +def frame_vad_construct_pyannote_object_per_file( + prediction: Union[str, List[float]], groundtruth: Union[str, List[float]], frame_length_in_sec: float = 0.01 +) -> Tuple[Annotation, Annotation]: + """ + Construct a Pyannote object for evaluation. + Args: + prediction (str) : path of VAD predictions stored as RTTM or CSV-like txt. + groundtruth (str): path of groundtruth rttm file. + frame_length_in_sec(float): frame length in seconds + Returns: + reference(pyannote.Annotation): groundtruth + hypothesis(pyannote.Annotation): prediction + """ + + hypothesis = Annotation() + if isinstance(groundtruth, str) and prediction.endswith('.rttm'): + hypothesis = read_rttm_as_pyannote_object(prediction, speaker_override='speech') + elif isinstance(groundtruth, str) and prediction.endswith('.txt'): + pred = pd.read_csv(prediction, sep=" ", header=None) + for index, row in pred.iterrows(): + hypothesis[Segment(float(row[0]), float(row[0]) + float(row[1]))] = 'speech' + elif isinstance(groundtruth, list): + segments = convert_labels_to_speech_segments(prediction, frame_length_in_sec) + for segment in segments: + hypothesis[Segment(segment[0], segment[1])] = 'speech' + else: + raise ValueError('prediction must be a path to rttm file or a list of frame labels.') + + reference = Annotation() + if isinstance(groundtruth, str) and groundtruth.endswith('.rttm'): + reference = read_rttm_as_pyannote_object(groundtruth, speaker_override='speech') + elif 
isinstance(groundtruth, list): + segments = convert_labels_to_speech_segments(groundtruth, frame_length_in_sec) + for segment in segments: + reference[Segment(segment[0], segment[1])] = 'speech' + else: + raise ValueError('groundtruth must be a path to rttm file or a list of frame labels.') + return reference, hypothesis + + +def frame_vad_infer_load_manifest(cfg: DictConfig): + """ + Load manifest file and prepare label/rttm mapping + Args: + cfg: config file + Returns: + manifest_orig (List[Dict]): original manifest data + key_labels_map (Dict): mapping from unique_audio_name to its labels + key_rttm_map (Dict): mapping from unique_audio_name to its rttm file + """ + unique_audio_names = set() + key_labels_map = {} + key_rttm_map = {} + manifest_orig = [] + manifest_file = Path(cfg.dataset).absolute().as_posix() + with open(manifest_file, 'r') as fin: + for line in fin.readlines(): + entry = json.loads(line.strip()) + audio_filepath = get_full_path(audio_file=entry['audio_filepath'], manifest_file=manifest_file) + entry['audio_filepath'] = str(audio_filepath) + uniq_audio_name = Path(audio_filepath).stem + + if uniq_audio_name in unique_audio_names: + raise ValueError("Please make sure each line is with different audio_filepath! 
") + else: + unique_audio_names.add(uniq_audio_name) + + manifest_orig.append(entry) + + # always prefer RTTM labels if exist + if "label" not in entry or "rttm_filepath" in entry or "rttm_file" in entry: + rttm_key = "rttm_filepath" if "rttm_filepath" in entry else "rttm_file" + segments = load_speech_segments_from_rttm(entry[rttm_key]) + label_str = get_frame_labels( + segments=segments, + frame_length=cfg.vad.parameters.shift_length_in_sec, + duration=entry['duration'], + offset=entry['offset'], + ) + key_rttm_map[uniq_audio_name] = entry[rttm_key] + key_labels_map[uniq_audio_name] = [float(x) for x in label_str.split()] + elif entry.get("label", None) is not None: + key_labels_map[uniq_audio_name] = [float(x) for x in entry["label"].split()] + else: + raise ValueError("Must have either `label` or `rttm_filepath` in manifest") + + return manifest_orig, key_labels_map, key_rttm_map + + +def frame_vad_eval_detection_error( + pred_dir: str, key_labels_map: dict, key_rttm_map: dict, key_pred_rttm_map: dict, frame_length_in_sec: float +): + """ + Perform evaluation on frame-VAD results + Args: + pred_dir: directory of frame-VAD prediction files with in `.frame` format + key_labels_map: dictionary of mapping each to its labels + key_rttm_map: dictionary of mapping each to its GROUNDTRUTH rttm file + key_pred_rttm_map: dictionary of mapping each to its PREDICTED rttm file + frame_length_in_sec: frame length in seconds, e.g. 
0.02s + Returns: + auroc: AUROC score in 0~100% + report: Pyannote detection.DetectionErrorRate() report + """ + all_probs = [] + all_labels = [] + metric = detection.DetectionErrorRate() + key_probs_map = {} + predictions_list = list(Path(pred_dir).glob("*.frame")) + for frame_pred in tqdm(predictions_list, desc="Evaluating VAD results", total=len(predictions_list)): + pred_probs = [] + with frame_pred.open("r") as fin: + for line in fin.readlines(): + line = line.strip() + if not line: + continue + pred_probs.append(float(line)) + key = frame_pred.stem + key_probs_map[key] = pred_probs + key_labels_map[key] = align_labels_to_frames(probs=pred_probs, labels=key_labels_map[key]) + all_probs.extend(key_probs_map[key]) + all_labels.extend(key_labels_map[key]) + + if key in key_rttm_map: + groundtruth = key_rttm_map[key] + else: + groundtruth = key_labels_map[key] + + reference, hypothesis = frame_vad_construct_pyannote_object_per_file( + prediction=key_pred_rttm_map[key], groundtruth=groundtruth, frame_length_in_sec=frame_length_in_sec, + ) + metric(reference, hypothesis) + + auroc = roc_auc_score(y_true=all_labels, y_score=all_probs) + report = metric.report(display=False) + return auroc, report diff --git a/tests/collections/asr/test_asr_classification_model.py b/tests/collections/asr/test_asr_classification_model.py index 44125de92b3d..876bb6073a38 100644 --- a/tests/collections/asr/test_asr_classification_model.py +++ b/tests/collections/asr/test_asr_classification_model.py @@ -255,52 +255,13 @@ def test_EncDecClassificationDatasetConfig_for_AudioToSpeechLabelDataset(self): class TestEncDecFrameClassificationModel(TestEncDecClassificationModel): + @pytest.mark.parametrize(["logits_len", "labels_len"], [(20, 10), (21, 10), (19, 10), (20, 9), (20, 11)]) @pytest.mark.unit - def test_reshape_labels(self, frame_classification_model): + def test_reshape_labels(self, frame_classification_model, logits_len, labels_len): model = frame_classification_model.eval() - logits 
= torch.ones(4, 20, 2) - labels = torch.ones(4, 10) - logits_len = torch.tensor([6, 7, 8, 9]) - labels_len = torch.tensor([5, 6, 7, 8]) - labels_new, labels_len_new = model.reshape_labels( - logits=logits, labels=labels, logits_len=logits_len, labels_len=labels_len - ) - assert labels_new.size(1) == logits.size(1) - assert torch.equal(labels_len_new, torch.tensor([6, 7, 8, 9])) - - logits = torch.ones(4, 21, 2) - labels = torch.ones(4, 10) - logits_len = torch.tensor([6, 7, 8, 9]) - labels_len = torch.tensor([5, 6, 7, 8]) - labels_new, labels_len_new = model.reshape_labels( - logits=logits, labels=labels, logits_len=logits_len, labels_len=labels_len - ) - assert labels_new.size(1) == logits.size(1) - assert torch.equal(labels_len_new, torch.tensor([6, 7, 8, 9])) - - logits = torch.ones(4, 19, 2) - labels = torch.ones(4, 10) - logits_len = torch.tensor([6, 7, 8, 9]) - labels_len = torch.tensor([5, 6, 7, 8]) - labels_new, labels_len_new = model.reshape_labels( - logits=logits, labels=labels, logits_len=logits_len, labels_len=labels_len - ) - assert labels_new.size(1) == logits.size(1) - assert torch.equal(labels_len_new, torch.tensor([6, 7, 8, 9])) - - logits = torch.ones(4, 20, 2) - labels = torch.ones(4, 9) - logits_len = torch.tensor([6, 7, 8, 9]) - labels_len = torch.tensor([5, 6, 7, 8]) - labels_new, labels_len_new = model.reshape_labels( - logits=logits, labels=labels, logits_len=logits_len, labels_len=labels_len - ) - assert labels_new.size(1) == logits.size(1) - assert torch.equal(labels_len_new, torch.tensor([6, 7, 8, 9])) - - logits = torch.ones(4, 20, 2) - labels = torch.ones(4, 11) + logits = torch.ones(4, logits_len, 2) + labels = torch.ones(4, labels_len) logits_len = torch.tensor([6, 7, 8, 9]) labels_len = torch.tensor([5, 6, 7, 8]) labels_new, labels_len_new = model.reshape_labels( diff --git a/tests/collections/asr/utils/test_vad_utils.py b/tests/collections/asr/utils/test_vad_utils.py new file mode 100644 index 000000000000..a7672e1aa43d --- 
/dev/null +++ b/tests/collections/asr/utils/test_vad_utils.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from pyannote.core import Annotation, Segment + +from nemo.collections.asr.parts.utils.vad_utils import ( + align_labels_to_frames, + convert_labels_to_speech_segments, + frame_vad_construct_pyannote_object_per_file, + get_frame_labels, + get_nonspeech_segments, + load_speech_overlap_segments_from_rttm, + load_speech_segments_from_rttm, + read_rttm_as_pyannote_object, +) + + +def get_simple_rttm_without_overlap(rttm_file="test1.rttm"): + line = "SPEAKER 1 0 2 speech \n" + speech_segments = [[0.0, 2.0]] + with open(rttm_file, "w") as f: + f.write(line) + return rttm_file, speech_segments + + +def get_simple_rttm_with_overlap(rttm_file="test2.rttm"): + speech_segments = [[0.0, 3.0]] + overlap_segments = [[1.0, 2.0]] + with open(rttm_file, "w") as f: + f.write("SPEAKER 1 0 2 speech \n") + f.write("SPEAKER 1 1 2 speech \n") + return rttm_file, speech_segments, overlap_segments + + +def get_simple_rttm_with_silence(rttm_file="test3.rttm"): + line = "SPEAKER 1 1 2 speech \n" + speech_segments = [[1.0, 2.0]] + silence_segments = [[0.0, 1.0]] + with open(rttm_file, "w") as f: + f.write(line) + return rttm_file, speech_segments, silence_segments + + +class TestVADUtils: + @pytest.mark.parametrize(["logits_len", "labels_len"], [(20, 10), (20, 11), (20, 
9), (10, 21), (10, 19)]) + @pytest.mark.unit + def test_align_label_logits(self, logits_len, labels_len): + logits = np.arange(logits_len).tolist() + labels = np.arange(labels_len).tolist() + labels_new = align_labels_to_frames(probs=logits, labels=labels) + + assert len(labels_new) == len(logits) + + @pytest.mark.unit + def test_load_speech_segments_from_rttm(self, test_data_dir): + rttm_file, speech_segments = get_simple_rttm_without_overlap(test_data_dir + "/test1.rttm") + speech_segments_new = load_speech_segments_from_rttm(rttm_file) + assert speech_segments_new == speech_segments + + @pytest.mark.unit + def test_load_speech_overlap_segments_from_rttm(self, test_data_dir): + rttm_file, speech_segments, overlap_segments = get_simple_rttm_with_overlap(test_data_dir + "/test2.rttm") + speech_segments_new, overlap_segments_new = load_speech_overlap_segments_from_rttm(rttm_file) + assert speech_segments_new == speech_segments + assert overlap_segments_new == overlap_segments + + @pytest.mark.unit + def test_get_nonspeech_segments(self, test_data_dir): + rttm_file, speech_segments, silence_segments = get_simple_rttm_with_silence(test_data_dir + "/test3.rttm") + speech_segments_new = load_speech_segments_from_rttm(rttm_file) + silence_segments_new = get_nonspeech_segments(speech_segments_new) + assert silence_segments_new == silence_segments + + @pytest.mark.unit + def test_get_frame_labels(self, test_data_dir): + rttm_file, speech_segments = get_simple_rttm_without_overlap(test_data_dir + "/test4.rttm") + speech_segments_new = load_speech_segments_from_rttm(rttm_file) + frame_labels = get_frame_labels(speech_segments_new, 0.02, 0.0, 3.0, as_str=False) + assert frame_labels[0] == 1 + assert len(frame_labels) == 150 + + @pytest.mark.unit + def test_convert_labels_to_speech_segments(self, test_data_dir): + rttm_file, speech_segments = get_simple_rttm_without_overlap(test_data_dir + "/test5.rttm") + speech_segments_new = load_speech_segments_from_rttm(rttm_file) + 
frame_labels = get_frame_labels(speech_segments_new, 0.02, 0.0, 3.0, as_str=False) + speech_segments_new = convert_labels_to_speech_segments(frame_labels, 0.02) + assert speech_segments_new == speech_segments + + @pytest.mark.unit + def test_read_rttm_as_pyannote_object(self, test_data_dir): + rttm_file, speech_segments = get_simple_rttm_without_overlap(test_data_dir + "/test6.rttm") + pyannote_object = read_rttm_as_pyannote_object(rttm_file) + pyannote_object_gt = Annotation() + pyannote_object_gt[Segment(0.0, 2.0)] = 'speech' + assert pyannote_object == pyannote_object_gt + + @pytest.mark.unit + def test_frame_vad_construct_pyannote_object_per_file(self, test_data_dir): + rttm_file, speech_segments = get_simple_rttm_without_overlap(test_data_dir + "/test7.rttm") + # test for rttm input + ref, hyp = frame_vad_construct_pyannote_object_per_file(rttm_file, rttm_file) + pyannote_object_gt = Annotation() + pyannote_object_gt[Segment(0.0, 2.0)] = 'speech' + assert ref == hyp == pyannote_object_gt + + # test for list input + speech_segments = load_speech_segments_from_rttm(rttm_file) + frame_labels = get_frame_labels(speech_segments, 0.02, 0.0, 3.0, as_str=False) + speech_segments_new = convert_labels_to_speech_segments(frame_labels, 0.02) + assert speech_segments_new == speech_segments + ref, hyp = frame_vad_construct_pyannote_object_per_file(frame_labels, frame_labels, 0.02) + assert ref == hyp == pyannote_object_gt From cfbe0924db02a1a557aeeea55cc1a01b83903e71 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:39:14 -0700 Subject: [PATCH 003/123] [TTS][zh] refine hardcoded lowercase for ASCII letters. 
(#6781) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- .../text_to_speech/tokenizer_utils.py | 7 +- .../text_to_speech/tts_tokenizers.py | 30 +++---- .../tts/g2p/models/zh_cn_pinyin.py | 82 +++++++++++++------ 3 files changed, 77 insertions(+), 42 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py index 2644e487d585..92a3e0fb49e0 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py @@ -185,10 +185,9 @@ def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]: return _word_tokenize(words) -# TODO @xueyang: deprecate language-specific text preprocessing and use any_locale_text_preprocessing. -def spanish_text_preprocessing(text): +def spanish_text_preprocessing(text: str) -> str: return text.lower() -def chinese_text_preprocessing(text): - return text.lower() +def chinese_text_preprocessing(text: str) -> str: + return text diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 21f352d64710..abcbdb1661b9 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -692,7 +692,7 @@ def __init__( sep='|', # To be able to distinguish between 2/3 letters codes. add_blank_at=None, pad_with_space=False, - text_preprocessing_func=lambda text: chinese_text_preprocessing(text), + text_preprocessing_func=chinese_text_preprocessing, ): """Chinese phoneme-based tokenizer. 
Args: @@ -716,12 +716,15 @@ def __init__( if silence is not None: self.silence, tokens = len(tokens), tokens + [silence] # Silence - self.phonemes_list = g2p.phonemes_list - self.tones_list = g2p.tones_list + self.phoneme_list = g2p.phoneme_list + self.tone_list = g2p.tone_list + self.ascii_letter_list = g2p.ascii_letter_list - tokens.extend(self.phonemes_list) - tokens.extend(self.tones_list) - tokens.extend(string.ascii_lowercase) + tokens.extend(self.phoneme_list) + tokens.extend(self.tone_list) + tokens.extend(self.ascii_letter_list) + + self.text_preprocessing_func = text_preprocessing_func if apostrophe: tokens.append("'") # Apostrophe @@ -737,15 +740,12 @@ def __init__( self.punct = punct self.pad_with_space = pad_with_space - - self.text_preprocessing_func = text_preprocessing_func self.g2p = g2p - def encode(self, text): + def encode(self, text: str) -> List[int]: """See base class for more information.""" - text = self.text_preprocessing_func(text) - g2p_text = self.g2p(text) # TODO: handle infer + g2p_text = self.g2p(text) return self.encode_from_g2p(g2p_text, text) def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): @@ -762,15 +762,15 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): # Add space if last one isn't one if p == space and len(ps) > 0 and ps[-1] != space: ps.append(p) - # Add next phoneme or char (if chars=True) - elif (p.isalnum() or p == "'" or p in self.phonemes_list or p in self.tones_list) and p in tokens: + # Add next phoneme or tone or ascii letter or apostrophe. + elif (p.isalnum() or p == "'" or p in self.phoneme_list + self.tone_list + self.ascii_letter_list) and p in tokens: ps.append(p) - # Add punct + # Add punctuation elif (p in self.PUNCT_LIST) and self.punct: ps.append(p) # Warn about unknown char/phoneme elif p != space: - message = f"Text: [{''.join(g2p_text)}] contains unknown char/phoneme: [{p}]." 
+ message = f"Text: [{' '.join(g2p_text)}] contains unknown char/phoneme: [{p}]." if raw_text is not None: message += f"Original text: [{raw_text}]. Symbol will be skipped." logging.warning(message) diff --git a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py index 73bcbec5a414..35a22f6ba118 100644 --- a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py +++ b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py @@ -16,7 +16,9 @@ from collections import defaultdict from typing import Dict, List, Optional, Union +from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_grapheme_character_set from nemo.collections.tts.g2p.models.base import BaseG2p +from nemo.collections.tts.g2p.utils import set_grapheme_case from nemo.utils import logging @@ -25,25 +27,38 @@ def __init__( self, phoneme_dict: Union[str, pathlib.Path, Dict[str, List[str]]], phoneme_prefix: str = "#", + phoneme_case: str = "upper", tone_prefix: str = "#", + ascii_letter_prefix: str = "", + ascii_letter_case: str = "lower", word_tokenize_func=None, apply_to_oov_word=None, mapping_file: Optional[str] = None, word_segmenter: Optional[str] = None, ): - """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then pinyin sequences would - be further converted into phoneme sequences using pinyin_dict_nv_22.10.txt dict file. For Chinese and English bilingual sentences, the English words - would be converted into letters. + """ + Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then + pinyin sequences would be further converted into phoneme sequences by looking them up in the `phoneme_dict`. + This G2P module also works with Chinese/English bilingual sentences where English words would be converted + into letters. 
It is advised to attach prefix symbols for Chinese phonemes and tones to discriminate them + from English letters to avoid any potential symbol set overlaps. Args: phoneme_dict (str, Path, Dict): Path to pinyin_dict_nv_22.10.txt dict file or a dict object. phoneme_prefix (str): Prepend a special symbol to any phonemes in order to distinguish phonemes from graphemes because there may be overlaps between the two sets. Phoneme dictionary typically applies uppercase initials and finals. It is suggested to choose a prefix that is not used or preserved somewhere else. Default to "#". + phoneme_case (str): Specify the case chosen from `"lower"`, `"upper"`, or `"mixed"`, and process the + cases of Chinese phonemes. Default to `"upper"`. tone_prefix (str): Prepend a special symbol to any tone digits. Default to "#". + ascii_letter_prefix (str): Prepend a special symbol to any ASCII letters. Default to "". + ascii_letter_case (str): Specify the case chosen from `"lower"`, `"upper"`, or `"mixed"`, and process the + cases of non-Chinese words. Default to `"lower"`. word_tokenize_func: Function for tokenizing text to words. - It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not. - It is expected that unchangeable word representation will be represented as List[str], other cases are represented as str. + It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation + and flag whether to leave unchanged or not. + It is expected that unchangeable word representation will be represented as List[str], other cases are + represented as str. It is useful to mark word as unchangeable which is already in phoneme representation. apply_to_oov_word: Function that will be applied to out of phoneme_dict word. word_segmenter: method that will be applied to segment utterances into words for better polyphone disambiguation. 
@@ -58,13 +73,27 @@ def __init__( phoneme_prefix = "" if tone_prefix is None: tone_prefix = "" + if ascii_letter_prefix is None: + ascii_letter_prefix = "" + # phonemes phoneme_dict = ( - self._parse_as_pinyin_dict(phoneme_dict, phoneme_prefix) + self._parse_as_pinyin_dict(phoneme_dict, phoneme_prefix, phoneme_case) if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path) else phoneme_dict ) - self.phonemes_list = list({pron for prons in phoneme_dict.values() for pron in prons}) + self.phoneme_list = list({pron for prons in phoneme_dict.values() for pron in prons}) + + # tones + self.tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)} + self.tone_list = list(self.tone_dict.values()) + + # ascii letters + self.ascii_letter_dict = { + x: ascii_letter_prefix + x for x in get_grapheme_character_set(locale="en-US", case=ascii_letter_case) + } + self.ascii_letter_list = sorted(self.ascii_letter_dict) + self.ascii_letter_case = ascii_letter_case if apply_to_oov_word is None: logging.warning( @@ -81,9 +110,6 @@ def __init__( mapping_file=mapping_file, ) - self.tones = {str(x): tone_prefix + str(x) for x in range(1, 6)} - self.tones_list = list(self.tones.values()) - if word_segmenter == "jieba": try: import jieba @@ -109,7 +135,7 @@ def __init__( @staticmethod def _parse_as_pinyin_dict( - phoneme_dict_path: Union[str, pathlib.Path], phoneme_prefix: str + phoneme_dict_path: Union[str, pathlib.Path], phoneme_prefix: str, phoneme_case: str ) -> Dict[str, List[str]]: """Loads pinyin dict file, and generates a set of all valid symbols.""" g2p_dict = defaultdict(list) @@ -120,11 +146,13 @@ def _parse_as_pinyin_dict( continue parts = line.split('\t') - # lowercase the Chinese syllables because pypinyin requires lowercase inputs. + # Convert the cases of Chinese syllables loaded from the dictionary to lowercase to match the lowercase + # Chinese syllable outputs generated by the function `pypinyin.lazy_pinyin`. 
Note that the function + # `pypinyin.lazy_pinyin` preserves the cases of ASCII letters. syllable = parts[0].lower() - pronunciation = parts[1].split() + pronunciation = set_grapheme_case(parts[1], case=phoneme_case).split() - # add phoneme prefix to distinguish from other symbols. + # add a prefix to distinguish phoneme symbols from non-phoneme symbols. pronunciation_with_prefix = [phoneme_prefix + pron for pron in pronunciation] g2p_dict[syllable] = pronunciation_with_prefix @@ -132,14 +160,19 @@ def _parse_as_pinyin_dict( def __call__(self, text: str) -> List[str]: """ - errors func handle below is to process the bilingual situation, - where English words would be split into letters. - e.g. 我今天去了Apple Store, 买了一个iPhone。 - would return a list - ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', - ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2', - 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。'] + This forward pass function translates Chinese characters into pinyin sequences and then converts the pinyin + into phonemes. It is primarily designed to process texts containing with Chinese characters, but we have + extended its support to handle texts that include both Chinese and English. This extension was mainly + necessitated by the limited availability of bilingual datasets. The `errors` argument used in the + `pypinyin.lazy_pinyin` function below is used to process non-Chinese words, where each English word is split + into letters. 
+ + For example, The text "我今天去了Apple Store, 买了一个iPhone。" would be converted as a list, + `['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', + 'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']` """ + text = set_grapheme_case(text, case=self.ascii_letter_case) + pinyin_seq = [] words_list = self.word_segmenter(text) @@ -154,15 +187,18 @@ def __call__(self, text: str) -> List[str]: ) phoneme_seq = [] for pinyin in pinyin_seq: + # only pinyin has tones while non-pinyin doesn't. tone_hyp = pinyin[-1] - if tone_hyp in self.tones: + if tone_hyp in self.tone_dict: syllable = pinyin[:-1] assert syllable in self.phoneme_dict, f"Syllable <{syllable}> does not exist in the dictionary." phoneme_seq += self.phoneme_dict[syllable] - phoneme_seq.append(self.tones[tone_hyp]) + phoneme_seq.append(self.tone_dict[tone_hyp]) # All pinyin would end up with a number in 1-5, which represents tones of the pinyin. # For symbols which are not pinyin, such as English letters and Chinese punctuations, we directly # use them as inputs. 
+ elif tone_hyp in self.ascii_letter_dict: + phoneme_seq.append(self.ascii_letter_dict[tone_hyp]) else: phoneme_seq.append(pinyin) return phoneme_seq From 5428a97e3c9578b79fa7b30b6c53f2ae9759f418 Mon Sep 17 00:00:00 2001 From: bene-ges Date: Fri, 2 Jun 2023 18:44:18 +0300 Subject: [PATCH 004/123] Spellchecking ASR customization model (#6179) * bug fixes Signed-off-by: Alexandra Antonova * fix bugs, add preparation and evaluation scripts, add readme Signed-off-by: Alexandra Antonova * small fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add real coverage calculation, small fixes, more debug information Signed-off-by: Alexandra Antonova * add option to pass a filelist and output folder - to handle inference from multiple input files Signed-off-by: Alexandra Antonova * added preprocessing for yago wikipedia articles - finding yago entities and their subphrases Signed-off-by: Alexandra Antonova * yago wiki preprocessing, sampling, pseudonormalization Signed-off-by: Alexandra Antonova * more scripts for preparation of training examples Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * add some alphabet checks Signed-off-by: Alexandra Antonova * add bert on subwords, concatenate it to bert on characters Signed-off-by: Alexandra Antonova * add calculation of character_pos_to_subword_pos Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * tensor join bug fix Signed-off-by: Alexandra Antonova * double hidden_size in classifier Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * default index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * pad index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * remove pdb Signed-off-by: 
Alexandra Antonova * fix bugs, add creation of tarred dataset Signed-off-by: Alexandra Antonova * add possibility to change sequence len at inference Signed-off-by: Alexandra Antonova * change sampling of dummy candidates at inference, add candidate info file Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * update transcription now uses info Signed-off-by: Alexandra Antonova * write path Signed-off-by: Alexandra Antonova * 1. add tarred dataset support(untested). 2. fix bug with ban_ngrams in indexing Signed-off-by: Alexandra Antonova * skip short_sent if no real candidates Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * add braceexpand Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug in np.ones Signed-off-by: Alexandra Antonova * fix bug in collate Signed-off-by: Alexandra Antonova * change tensor type to long because of error in torch.gather Signed-off-by: Alexandra Antonova * fix for empty spans tensor Signed-off-by: Alexandra Antonova * same fixes in _collate_fn for tarred dataset Signed-off-by: Alexandra Antonova * fix bug from previous commit Signed-off-by: Alexandra Antonova * change int types to be shorter to minimize tar size Signed-off-by: Alexandra Antonova * refactoring of datasets and inference Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * tar by 100k examples, small fixes Signed-off-by: Alexandra Antonova * small fixes, add analytics script Signed-off-by: Alexandra Antonova * Add functions for dynamic programming comparison to get best path by ngrams Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * small fix Signed-off-by: Alexandra Antonova * fixes to support testing 
on SPGISpeech Signed-off-by: Alexandra Antonova * add preprocessing for userlibri Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * small refactoring before pr. Add bash-scripts reproducing evaluation Signed-off-by: Alexandra Antonova * style fix Signed-off-by: Alexandra Antonova * small fixes in inference Signed-off-by: Alexandra Antonova * bug fix - didn't move window on last symbol Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug - shuffle was before truncation of sorted candidates Signed-off-by: Alexandra Antonova * refactoring, fix some bugs Signed-off-by: Alexandra Antonova * variour fixes. Add word_indices at inference Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add candidate positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move data preparation and evaluation to other repo Signed-off-by: Alexandra Antonova * add infer_reproduce_paper. 
Refactoring Signed-off-by: Alexandra Antonova * refactor inference using fragment indices Signed-off-by: Alexandra Antonova * add some helper functions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug with parameters order Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bugs Signed-off-by: Alexandra Antonova * refactoring, fix bug Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add multiple variants of adjusting start/end positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add unit tests, other fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Alexandra Antonova * fix CodeQl warnings Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * fix bugs, add preparation and evaluation scripts, add readme Signed-off-by: Alexandra Antonova * small fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add real coverage calculation, small fixes, more debug information Signed-off-by: Alexandra Antonova * add option to pass a filelist and output folder - to handle inference from multiple input files Signed-off-by: Alexandra Antonova * added preprocessing for yago wikipedia articles - finding yago entities and their subphrases Signed-off-by: Alexandra Antonova * yago wiki preprocessing, sampling, pseudonormalization Signed-off-by: 
Alexandra Antonova * more scripts for preparation of training examples Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * add some alphabet checks Signed-off-by: Alexandra Antonova * add bert on subwords, concatenate it to bert on characters Signed-off-by: Alexandra Antonova * add calculation of character_pos_to_subword_pos Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * tensor join bug fix Signed-off-by: Alexandra Antonova * double hidden_size in classifier Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * default index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * pad index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * remove pdb Signed-off-by: Alexandra Antonova * fix bugs, add creation of tarred dataset Signed-off-by: Alexandra Antonova * add possibility to change sequence len at inference Signed-off-by: Alexandra Antonova * change sampling of dummy candidates at inference, add candidate info file Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * update transcription now uses info Signed-off-by: Alexandra Antonova * write path Signed-off-by: Alexandra Antonova * 1. add tarred dataset support(untested). 2. 
fix bug with ban_ngrams in indexing Signed-off-by: Alexandra Antonova * skip short_sent if no real candidates Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * add braceexpand Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug in np.ones Signed-off-by: Alexandra Antonova * fix bug in collate Signed-off-by: Alexandra Antonova * change tensor type to long because of error in torch.gather Signed-off-by: Alexandra Antonova * fix for empty spans tensor Signed-off-by: Alexandra Antonova * same fixes in _collate_fn for tarred dataset Signed-off-by: Alexandra Antonova * fix bug from previous commit Signed-off-by: Alexandra Antonova * change int types to be shorter to minimize tar size Signed-off-by: Alexandra Antonova * refactoring of datasets and inference Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * tar by 100k examples, small fixes Signed-off-by: Alexandra Antonova * small fixes, add analytics script Signed-off-by: Alexandra Antonova * Add functions for dynamic programming comparison to get best path by ngrams Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * small fix Signed-off-by: Alexandra Antonova * fixes to support testing on SPGISpeech Signed-off-by: Alexandra Antonova * add preprocessing for userlibri Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * small refactoring before pr. 
Add bash-scripts reproducing evaluation Signed-off-by: Alexandra Antonova * style fix Signed-off-by: Alexandra Antonova * small fixes in inference Signed-off-by: Alexandra Antonova * bug fix - didn't move window on last symbol Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug - shuffle was before truncation of sorted candidates Signed-off-by: Alexandra Antonova * refactoring, fix some bugs Signed-off-by: Alexandra Antonova * variour fixes. Add word_indices at inference Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add candidate positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move data preparation and evaluation to other repo Signed-off-by: Alexandra Antonova * add infer_reproduce_paper. Refactoring Signed-off-by: Alexandra Antonova * refactor inference using fragment indices Signed-off-by: Alexandra Antonova * add some helper functions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug with parameters order Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bugs Signed-off-by: Alexandra Antonova * refactoring, fix bug Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add multiple variants of adjusting start/end positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add 
unit tests, other fixes Signed-off-by: Alexandra Antonova * fix Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix CodeQl warnings Signed-off-by: Alexandra Antonova * add script for full inference pipeline, refactoring Signed-off-by: Alexandra Antonova * add tutorial Signed-off-by: Alexandra Antonova * take example data from HuggingFace Signed-off-by: Alexandra Antonova * add docs Signed-off-by: Alexandra Antonova * fix comment Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * small fixes for PR Signed-off-by: Alexandra Antonova * add some more tests Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * try to fix tests adding with_downloads Signed-off-by: Alexandra Antonova * skip tests with tokenizer download Signed-off-by: Alexandra Antonova --------- Signed-off-by: Alexandra Antonova Signed-off-by: Alexandra Antonova Co-authored-by: Alexandra Antonova Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/nlp/models.rst | 1 + .../nlp/spellchecking_asr_customization.rst | 128 ++ docs/source/starthere/tutorials.rst | 3 + .../spellchecking_asr_customization/README.md | 32 + .../checkpoint_to_nemo.py | 38 + ...pellchecking_asr_customization_config.yaml | 97 ++ .../convert_data_to_tarred.sh | 50 + .../create_custom_vocab_index.py | 72 + .../create_tarred_dataset.py | 99 ++ .../helpers.py | 86 + .../postprocess_and_update_manifest.py | 79 + .../prepare_input_from_manifest.py | 129 ++ .../run_infer.sh | 99 ++ .../run_training.sh | 56 + .../run_training_tarred.sh | 63 + .../spellchecking_asr_customization_infer.py | 123 ++ .../spellchecking_asr_customization_train.py | 66 + .../extract_giza_alignments.py | 215 +-- .../__init__.py | 20 + .../bert_example.py | 593 +++++++ .../dataset.py | 521 ++++++ 
.../spellchecking_asr_customization/utils.py | 845 ++++++++++ .../text_normalization_as_tagging/utils.py | 196 +++ nemo/collections/nlp/models/__init__.py | 1 + .../__init__.py | 18 + .../spellchecking_model.py | 526 ++++++ .../spoken_wikipedia/run.sh | 2 +- .../test_spellchecking_asr_customization.py | 1102 +++++++++++++ .../ctc_segmentation/scripts/prepare_data.py | 2 +- ...pellMapper_English_ASR_Customization.ipynb | 1403 +++++++++++++++++ .../spellmapper_customization_vocabulary.png | Bin 0 -> 39243 bytes .../images/spellmapper_data_preparation.png | Bin 0 -> 75265 bytes .../images/spellmapper_inference_pipeline.png | Bin 0 -> 146148 bytes 33 files changed, 6459 insertions(+), 206 deletions(-) create mode 100644 docs/source/nlp/spellchecking_asr_customization.rst create mode 100644 examples/nlp/spellchecking_asr_customization/README.md create mode 100644 examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py create mode 100644 examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml create mode 100644 examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh create mode 100644 examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py create mode 100644 examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py create mode 100644 examples/nlp/spellchecking_asr_customization/helpers.py create mode 100644 examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py create mode 100644 examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py create mode 100644 examples/nlp/spellchecking_asr_customization/run_infer.sh create mode 100644 examples/nlp/spellchecking_asr_customization/run_training.sh create mode 100644 examples/nlp/spellchecking_asr_customization/run_training_tarred.sh create mode 100644 examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py create mode 100644 
examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py create mode 100644 nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py create mode 100644 nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py create mode 100644 nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py create mode 100644 nemo/collections/nlp/data/spellchecking_asr_customization/utils.py create mode 100644 nemo/collections/nlp/models/spellchecking_asr_customization/__init__.py create mode 100644 nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py create mode 100644 tests/collections/nlp/test_spellchecking_asr_customization.py create mode 100644 tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb create mode 100644 tutorials/nlp/images/spellmapper_customization_vocabulary.png create mode 100644 tutorials/nlp/images/spellmapper_data_preparation.png create mode 100644 tutorials/nlp/images/spellmapper_inference_pipeline.png diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index 932be201bfb2..ad50d976db9f 100755 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -9,6 +9,7 @@ NeMo's NLP collection supports provides the following task-specific models: :maxdepth: 1 punctuation_and_capitalization_models + spellchecking_asr_customization token_classification joint_intent_slot text_classification diff --git a/docs/source/nlp/spellchecking_asr_customization.rst b/docs/source/nlp/spellchecking_asr_customization.rst new file mode 100644 index 000000000000..f9009b520361 --- /dev/null +++ b/docs/source/nlp/spellchecking_asr_customization.rst @@ -0,0 +1,128 @@ +.. _spellchecking_asr_customization: + +SpellMapper (Spellchecking ASR Customization) Model +===================================================== + +SpellMapper is a non-autoregressive model for postprocessing of ASR output. 
It gets as input a single ASR hypothesis (text) and a custom vocabulary and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Unlike traditional spellchecking approaches, which aim to correct known words using language models, SpellMapper's goal is to correct highly specific user terms, out-of-vocabulary (OOV) words or spelling variations (e.g., "John Koehn", "Jon Cohen"). + +This model is an alternative to word boosting/shallow fusion approaches: + +- does not require retraining ASR model; +- does not require beam-search/language model (LM); +- can be applied on top of any English ASR model output; + +Model Architecture +------------------ +Though SpellMapper is based on `BERT `__ :cite:`nlp-ner-devlin2018bert` architecture, it uses some non-standard tricks that make it different from other BERT-based models: + +- ten separators (``[SEP]`` tokens) are used to combine the ASR hypothesis and ten candidate phrases into a single input; +- the model works on character level; +- subword embeddings are concatenated to the embeddings of each character that belongs to this subword; + + .. code:: + + Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... + +The model calculates logits for each character x 11 labels: + +- ``0`` - character doesn't belong to any candidate, +- ``1..10`` - character belongs to candidate with this id. + +At inference average pooling is applied to calculate replacement probability for the whole fragments. 
+ +Quick Start Guide +----------------- + +We recommend you try this model in a Jupyter notebook (a GPU is required): +`NeMo/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb `__. + +A pretrained English checkpoint can be found at `HuggingFace `__. + +An example inference pipeline can be found here: `NeMo/examples/nlp/spellchecking_asr_customization/run_infer.sh `__. + +An example script on how to train the model can be found here: `NeMo/examples/nlp/spellchecking_asr_customization/run_training.sh `__. + +An example script on how to train on large datasets can be found here: `NeMo/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh `__. + +The default configuration file for the model can be found here: `NeMo/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml `__. + +.. _dataset_spellchecking_asr_customization: + +Input/Output Format at Inference stage +-------------------------------------- +Here we describe the input/output format of the SpellMapper model. + +.. note:: + + If you use the `inference pipeline `__, this format is handled internally: you only need to provide an input manifest and a user vocabulary, and you will get a corrected manifest. + +An input line should consist of 4 tab-separated columns: + 1. text of ASR-hypothesis + 2. texts of 10 candidates separated by semicolon + 3. 1-based ids of non-dummy candidates, separated by space + 4. approximate start/end coordinates of non-dummy candidates (correspond to ids in third column) + +Example input (in one line): + +..
code:: + + t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x + h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d + 1 2 6 7 8 9 10 + CUSTOM 6 23;CUSTOM 4 10;CUSTOM 4 15;CUSTOM 56 62;CUSTOM 5 19;CUSTOM 28 31;CUSTOM 39 48 + +Each line in SpellMapper output is tab-separated and consists of 4 columns: + 1. ASR-hypothesis (same as in input) + 2. 10 candidates separated by semicolon (same as in input) + 3. fragment predictions, separated by semicolon, each prediction is a tuple (start, end, candidate_id, probability) + 4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes) + +Example output (in one line): + +.. code:: + + t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x + h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d + 56 62 7 0.99998;4 20 8 0.95181;12 20 8 0.44829;4 17 8 0.99464;12 17 8 0.97645 + 8 8 8 0 8 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 + +Training Data Format +-------------------- + +For training, the data should consist of 5 files: + +- ``config.json`` - BERT config +- ``label_map.txt`` - labels from 0 to 10, do not change +- ``semiotic_classes.txt`` - currently there are only two classes: ``PLAIN`` and ``CUSTOM``, do not change +- ``train.tsv`` - training examples +- ``test.tsv`` - validation examples + +Note that since all these examples are synthetic, we do not reserve a set for final testing. Instead, we run `inference pipeline `__ and compare resulting word error rate (WER) to the WER of baseline ASR output. 
+ +One (non-tarred) training example should consist of 4 tab-separated columns: + 1. text of ASR-hypothesis + 2. texts of 10 candidates separated by semicolon + 3. 1-based ids of correct candidates, separated by space, or 0 if none + 4. start/end coordinates of correct candidates (correspond to ids in third column) + +Example (in one line): + +.. code:: + + a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o + d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y + 1 3 + CUSTOM 12 23;CUSTOM 28 41 + +For data preparation see `this script `__ + + +References +---------- + +.. bibliography:: nlp_all.bib + :style: plain + :labelprefix: NLP-NER + :keyprefix: nlp-ner- diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index cb81aecc1109..9c960053398b 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -130,6 +130,9 @@ To run a tutorial: * - NLP - Punctuation and Capitalization - `Punctuation and Capitalization `_ + * - NLP + - Spellchecking ASR Customization - SpellMapper + - `Spellchecking ASR Customization - SpellMapper `_ * - NLP - Entity Linking - `Entity Linking `_ diff --git a/examples/nlp/spellchecking_asr_customization/README.md b/examples/nlp/spellchecking_asr_customization/README.md new file mode 100644 index 000000000000..2d83fd8d11ad --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/README.md @@ -0,0 +1,32 @@ +# SpellMapper - spellchecking model for ASR Customization + +This model is inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf, but does not repeat its implementation. 
+The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. +Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). + +As initial data we use about 5 million entities from [YAGO corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/yago/downloads/). These entities are short phrases from Wikipedia headings. +In order to get misspelled predictions we feed this data to a TTS model and then to an ASR model. +Having a "parallel" corpus of "correct + misspelled" phrases, we use statistical machine translation techniques to create a dictionary of possible ngram mappings with their respective frequencies. +We create an auxiliary algorithm that takes as input a sentence (ASR hypothesis) and a large custom dictionary (e.g. 5000 phrases) and selects top 10 candidate phrases that are probably contained in this sentence in a misspelled way. +The task of our final neural model is to predict which fragments in the ASR hypothesis should be replaced by which of top-10 candidate phrases if any. + +The pipeline consists of multiple steps: + +1. Download or generate training data. + See `https://github.com/bene-ges/nemo_compatible/tree/main/scripts/nlp/en_spellmapper/dataset_preparation` + +2. [Optional] Convert training dataset to tarred files. + `convert_data_to_tarred.sh` + +3. Train spellchecking model. + `run_training.sh` + or + `run_training_tarred.sh` + +4. Run evaluation.
+ - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) + - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_userlibri.sh) + - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_spoken_wikipedia.sh) + +5. Run inference. + `bash run_infer.sh` diff --git a/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py b/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py new file mode 100644 index 000000000000..c2f514f3e67e --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script converts checkpoint .ckpt to .nemo file. + +This script uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` +config file by default. The other option is to set another config file via command +line arguments by `--config-name=CONFIG_FILE_PATH`.
+""" + +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") +def main(cfg: DictConfig) -> None:  # Load a model from a checkpoint and re-save it in .nemo format. + logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') + SpellcheckingAsrCustomizationModel.load_from_checkpoint(cfg.checkpoint_path).save_to(cfg.target_nemo_path)  # NOTE(review): the default YAML config defines neither checkpoint_path nor target_nemo_path, so both presumably have to be supplied as Hydra command-line overrides - confirm + + +if __name__ == "__main__": + main() diff --git a/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml b/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml new file mode 100644 index 000000000000..c98915cdfc6f --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml @@ -0,0 +1,97 @@ +name: &name spellchecking +lang: ??? # e.g. 'ru', 'en' + +# Pretrained Nemo Models +pretrained_model: null + +trainer: + devices: 1 # the number of gpus, 0 for CPU + num_nodes: 1 + max_epochs: 3 # the number of training epochs + enable_checkpointing: false # provided by exp_manager + logger: false # provided by exp_manager + accumulate_grad_batches: 1 # accumulates grads every k batches + gradient_clip_val: 0.0 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + accelerator: gpu + strategy: ddp + log_every_n_steps: 1 # Interval of logging. + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + +model: + do_training: true + label_map: ??? # path/.../label_map.txt + semiotic_classes: ???
# path/.../semiotic_classes.txt + max_sequence_len: 128 + lang: ${lang} + hidden_size: 768 + + optim: + name: adamw + lr: 3e-5 + weight_decay: 0.1 + + sched: + name: WarmupAnnealing + + # pytorch lightning args + monitor: val_loss + reduce_on_plateau: false + + # scheduler config override + warmup_ratio: 0.1 + last_epoch: -1 + + language_model: + pretrained_model_name: bert-base-uncased # For ru, try DeepPavlov/rubert-base-cased | For de or multilingual, try bert-base-multilingual-cased + lm_checkpoint: null + config_file: null # json file, precedence over config + config: null + + tokenizer: + tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece + vocab_file: null # path to vocab file + tokenizer_model: null # only used if tokenizer is sentencepiece + special_tokens: null + +exp_manager: + exp_dir: nemo_experiments # where to store logs and checkpoints + name: training # name of experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + save_top_k: 3 + monitor: "val_loss" + mode: "min" + +tokenizer: + tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece; must interpolate a key that exists under `model` + vocab_file: null # path to vocab file + tokenizer_model: null # only used if tokenizer is sentencepiece + special_tokens: null + +# Data +data: + train_ds: + data_path: ??? # provide the full path to the file + batch_size: 8 + shuffle: true + num_workers: 3 + pin_memory: false + drop_last: false + + validation_ds: + data_path: ??? # provide the full path to the file. + batch_size: 8 + shuffle: false + num_workers: 3 + pin_memory: false + drop_last: false + + +# Inference +inference: + from_file: null # Path to the raw text, no labels required.
Each sentence on a separate line + out_file: null # Path to the output file + batch_size: 16 # batch size for inference.from_file diff --git a/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh b/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh new file mode 100644 index 000000000000..d4265eb4beb6 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Path to NeMo repository +NEMO_PATH=NeMo + +DATA_PATH="data_folder" + +## data_folder_example +## ├── tarred_data +## | └── (output) +## ├── config.json +##   ├── label_map.txt +##   ├── semiotic_classes.txt +## ├── test.tsv +## ├── 1.tsv +## ├── ... +## └── 200.tsv + +## Each of {1-200}.tsv input files are 110'000 examples subsets of all.tsv (except for validation part), +## generated by https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh +## Note that in this example we use 110'000 as input and only pack 100'000 of them to tar file. +## This is because some input examples, e.g. too long, can be skipped during preprocessing, and we want all tar files to contain fixed equal number of examples. 
+ +for part in {1..200} +do + python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ + lang="en" \ + data.train_ds.data_path=${DATA_PATH}/${part}.tsv \ + data.validation_ds.data_path=${DATA_PATH}/test.tsv \ + model.max_sequence_len=256 \ + model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ + model.language_model.config_file=${DATA_PATH}/config.json \ + model.label_map=${DATA_PATH}/label_map.txt \ + model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ + +output_tar_file=${DATA_PATH}/tarred_data/part${part}.tar \ + +take_first_n_lines=100000 +done diff --git a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py new file mode 100644 index 000000000000..07d64ec5b723 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script is used to create an index of custom vocabulary and save it to file. +See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. 
+""" + +from argparse import ArgumentParser + +from nemo.collections.nlp.data.spellchecking_asr_customization.utils import get_index, load_ngram_mappings + +parser = ArgumentParser(description="Create an index of custom vocabulary and save it to file") + +parser.add_argument( + "--input_name", required=True, type=str, help="Path to input file with custom vocabulary (plain text)" +) +parser.add_argument( + "--ngram_mappings", required=True, type=str, help="Path to input file with n-gram mapping vocabulary" +) +parser.add_argument("--output_name", required=True, type=str, help="Path to output file with custom vocabulary index") +parser.add_argument("--min_log_prob", default=-4.0, type=float, help="Threshold on log probability") +parser.add_argument( + "--max_phrases_per_ngram", + default=500, + type=int, + help="Threshold on number of phrases that can be stored for one n-gram key in index. Keys with more phrases are discarded.", +) +parser.add_argument( + "--max_misspelled_freq", default=125000, type=int, help="Threshold on maximum frequency of misspelled n-gram" +) + +args = parser.parse_args() + +# Load custom vocabulary +custom_phrases = set() +with open(args.input_name, "r", encoding="utf-8") as f: + for line in f: + phrase = line.strip() + custom_phrases.add(" ".join(list(phrase.replace(" ", "_")))) +print("Size of customization vocabulary:", len(custom_phrases)) + +# Load n-gram mappings vocabulary +ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=125000) + +# Generate index of custom phrases +phrases, ngram2phrases = get_index( + custom_phrases, + ngram_mapping_vocab, + ban_ngram, + min_log_prob=args.min_log_prob, + max_phrases_per_ngram=args.max_phrases_per_ngram, +) + +# Save index to file +with open(args.output_name, "w", encoding="utf-8") as out: + for ngram in ngram2phrases: + for phrase_id, begin, size, logprob in ngram2phrases[ngram]: + phrase = phrases[phrase_id] + out.write(ngram + "\t" + phrase + "\t" + 
str(begin) + "\t" + str(size) + "\t" + str(logprob) + "\n") diff --git a/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py b/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py new file mode 100644 index 000000000000..d0bdc2c9bd30 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script is used to create a tarred dataset for SpellcheckingAsrCustomizationModel. + +This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` +config file by default. The other option is to set another config file via command +line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking +at the example config file to see the list of parameters used for training. + +USAGE Example: +1. Obtain a processed dataset +2. 
Run: + python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ + lang=${LANG} \ + data.train_ds.data_path=${DATA_PATH}/train.tsv \ + model.language_model.pretrained_model_name=${LANGUAGE_MODEL} \ + model.label_map=${DATA_PATH}/label_map.txt \ + +output_tar_file=tarred/part1.tar \ + +take_first_n_lines=100000 + +""" +import pickle +import tarfile +from io import BytesIO + +from helpers import MODEL, instantiate_model_and_trainer +from omegaconf import DictConfig, OmegaConf + +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") +def main(cfg: DictConfig) -> None: + logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') + logging.info("Start creating tar file from " + cfg.data.train_ds.data_path + " ...") + _, model = instantiate_model_and_trainer( + cfg, MODEL, True + ) # instantiate the model as for training (do_training=True) so its training dataloader is set up; a pretrained model may not be available + dataset = model._train_dl.dataset + archive = tarfile.open(cfg.output_tar_file, mode="w") + max_lines = int(cfg.take_first_n_lines) + for i in range(len(dataset)): + if i >= max_lines: + logging.info("Reached " + str(max_lines) + " examples") + break + ( + input_ids, + input_mask, + segment_ids, + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + character_pos_to_subword_pos, + labels_mask, + labels, + spans, + ) = dataset[i] + + # every field of the example is stored, including the three mask arrays; each example becomes one pickled dict + content = { + "input_ids": input_ids, + "input_mask": input_mask, + "segment_ids": segment_ids, + "input_ids_for_subwords": input_ids_for_subwords, + "input_mask_for_subwords": input_mask_for_subwords, + "segment_ids_for_subwords": segment_ids_for_subwords, + "character_pos_to_subword_pos": character_pos_to_subword_pos, + "labels_mask": labels_mask, + "labels": labels, + "spans": spans, + } + b = BytesIO() + pickle.dump(content, b) + b.seek(0) + tarinfo =
tarfile.TarInfo(name="example_" + str(i) + ".pkl") + tarinfo.size = b.getbuffer().nbytes + archive.addfile(tarinfo=tarinfo, fileobj=b) + + archive.close() + logging.info("Tar file " + cfg.output_tar_file + " created!") + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/spellchecking_asr_customization/helpers.py b/examples/nlp/spellchecking_asr_customization/helpers.py new file mode 100644 index 000000000000..2db11b0e7d96 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/helpers.py @@ -0,0 +1,86 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from typing import Tuple + +import pytorch_lightning as pl +from omegaconf import DictConfig + +from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.utils import logging + +__all__ = ["MODEL", "MODEL_NAMES", "instantiate_model_and_trainer"] + +MODEL = "spellchecking" +MODEL_NAMES = [MODEL] + + +def instantiate_model_and_trainer( + cfg: DictConfig, model_name: str, do_training: bool +) -> Tuple[pl.Trainer, SpellcheckingAsrCustomizationModel]: + """ Function for instantiating a model and a trainer + Args: + cfg: The config used to instantiate the model and the trainer. + model_name: A str indicates the model direction, currently only 'itn'. 
+ do_training: A boolean flag indicates whether the model will be trained or evaluated. + + Returns: + trainer: A PyTorch Lightning trainer + model: A SpellcheckingAsrCustomizationModel + """ + + if model_name not in MODEL_NAMES: + raise ValueError(f"{model_name} is unknown model type") + + # Get configs for the corresponding models + trainer_cfg = cfg.get("trainer") + model_cfg = cfg.get("model") + pretrained_cfg = cfg.get("pretrained_model", None) + trainer = pl.Trainer(**trainer_cfg) + if not pretrained_cfg: + logging.info(f"Initializing {model_name} model") + if model_name == MODEL: + model = SpellcheckingAsrCustomizationModel(model_cfg, trainer=trainer) + else: + raise ValueError(f"{model_name} is unknown model type") + elif os.path.exists(pretrained_cfg): + logging.info(f"Restoring pretrained {model_name} model from {pretrained_cfg}") + save_restore_connector = NLPSaveRestoreConnector() + model = SpellcheckingAsrCustomizationModel.restore_from( + pretrained_cfg, save_restore_connector=save_restore_connector + ) + else: + logging.info(f"Loading pretrained model {pretrained_cfg}") + if model_name == MODEL: + if pretrained_cfg not in SpellcheckingAsrCustomizationModel.get_available_model_names(): + raise ( + ValueError( + f"{pretrained_cfg} not in the list of available Tagger models." 
+ f"Select from {SpellcheckingAsrCustomizationModel.list_available_models()}" + ) + ) + model = SpellcheckingAsrCustomizationModel.from_pretrained(pretrained_cfg) + else: + raise ValueError(f"{model_name} is unknown model type") + + # Setup train and validation data + if do_training: + model.setup_training_data(train_data_config=cfg.data.train_ds) + model.setup_validation_data(val_data_config=cfg.data.validation_ds) + + logging.info(f"Model {model_name} -- Device {model.device}") + return trainer, model diff --git a/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py b/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py new file mode 100644 index 000000000000..871d5e5c0c0c --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py @@ -0,0 +1,79 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script is used to postprocess SpellMapper results and generate an updated nemo ASR manifest. +See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. 
+""" + +from argparse import ArgumentParser + +from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( + update_manifest_with_spellmapper_corrections, +) + +parser = ArgumentParser(description="Postprocess SpellMapper results and generate an updated nemo ASR manifest") + +parser.add_argument("--input_manifest", required=True, type=str, help="Path to input nemo ASR manifest") +parser.add_argument( + "--field_name", default="pred_text", type=str, help="Name of json field with original ASR hypothesis text" +) +parser.add_argument( + "--short2full_name", + required=True, + type=str, + help="Path to input file with correspondence between sentence fragments and full sentences", +) +parser.add_argument( + "--spellmapper_results", required=True, type=str, help="Path to input file with SpellMapper inference results" +) +parser.add_argument("--output_manifest", required=True, type=str, help="Path to output nemo ASR manifest") +parser.add_argument("--min_prob", default=0.5, type=float, help="Threshold on replacement probability") +parser.add_argument( + "--use_dp", + action="store_true", + help="Whether to use additional replacement filtering by using dynamic programming", +) +parser.add_argument( + "--replace_hyphen_to_space", + action="store_true", + help="Whether to use space instead of hyphen in replaced fragments", +) +parser.add_argument( + "--ngram_mappings", type=str, required=True, help="File with ngram mappings, only needed if use_dp=true" +)  # NOTE(review): required=True contradicts the help text ("only needed if use_dp=true") - confirm which one is intended +parser.add_argument( + "--min_dp_score_per_symbol", + default=-1.5, + type=float, + help="Minimum dynamic programming sum score averaged by hypothesis length", +) + +args = parser.parse_args() + +# All parsed options are forwarded unchanged as keyword arguments to the library routine that rewrites the manifest +update_manifest_with_spellmapper_corrections( + input_manifest_name=args.input_manifest, + short2full_name=args.short2full_name, + output_manifest_name=args.output_manifest, + spellmapper_results_name=args.spellmapper_results, + min_prob=args.min_prob, + replace_hyphen_to_space=args.replace_hyphen_to_space, +
field_name=args.field_name, + use_dp=args.use_dp, + ngram_mappings=args.ngram_mappings, + min_dp_score_per_symbol=args.min_dp_score_per_symbol, +) + +print("Resulting manifest saved to: ", args.output_manifest) diff --git a/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py b/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py new file mode 100644 index 000000000000..6fd5e524390a --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py @@ -0,0 +1,129 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script contains an example on how to prepare input for SpellMapper inference from a nemo ASR manifest. +It splits sentences to shorter fragments, runs candidate retrieval and generates input in the required format. +It produces two output files: + 1. File with correspondence between sentence fragments and full sentences. + 2. File that will serve as input for SpellMapper inference. + +See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. 
+""" + +from argparse import ArgumentParser + +from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( + extract_and_split_text_from_manifest, + get_candidates, + load_index, +) + +parser = ArgumentParser(description="Prepare input for SpellMapper inference from a nemo ASR manifest") +parser.add_argument("--manifest", required=True, type=str, help="Path to input manifest file") +parser.add_argument( + "--custom_vocab_index", required=True, type=str, help="Path to input file with custom vocabulary index" +) +parser.add_argument( + "--big_sample", + required=True, + type=str, + help="Path to input file with big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval", +) +parser.add_argument( + "--short2full_name", + required=True, + type=str, + help="Path to output file with correspondence between sentence fragments and full sentences", +) +parser.add_argument( + "--output_name", + required=True, + type=str, + help="Path to output file that will serve as input for SpellMapper inference", +) +parser.add_argument("--field_name", default="pred_text", type=str, help="Name of json field with ASR hypothesis text") +parser.add_argument("--len_in_words", default=16, type=int, help="Maximum fragment length in words") +parser.add_argument( + "--step_in_words", + default=8, + type=int, + help="Step in words for moving to next fragment. If less than len_in_words, fragments will intersect", +) + +args = parser.parse_args() + +# Split ASR hypotheses to shorter fragments, because SpellMapper can't handle arbitrarily long sequences. +# The correspondence between short and original fragments is saved to a file and will be used at post-processing. 
+extract_and_split_text_from_manifest( + input_name=args.manifest, + output_name=args.short2full_name, + field_name=args.field_name, + len_in_words=args.len_in_words, + step_in_words=args.step_in_words, +) + +# Load index of custom vocabulary from file +phrases, ngram2phrases = load_index(args.custom_vocab_index) + +# Load big sample of phrases to sample dummy candidates if fewer than 10 are found by retrieval +big_sample_of_phrases = set() +with open(args.big_sample, "r", encoding="utf-8") as f: + for line in f: + phrase, freq = line.strip().split("\t") + if int(freq) > 50: # do not want to use frequent phrases as dummy candidates + continue + if len(phrase) < 6 or len(phrase) > 15: # do not want to use too short or too long phrases as dummy candidates + continue + big_sample_of_phrases.add(phrase) + +big_sample_of_phrases = list(big_sample_of_phrases) + +# Generate input for SpellMapper inference +out = open(args.output_name, "w", encoding="utf-8") +with open(args.short2full_name, "r", encoding="utf-8") as f: + for line in f: + short_sent, _ = line.strip().split("\t") + sent = "_".join(short_sent.split()) + letters = list(sent) + candidates = get_candidates(ngram2phrases, phrases, letters, big_sample_of_phrases) + if len(candidates) == 0: + continue + if len(candidates) != 10: + raise ValueError(f"expect 10 candidates, got: {len(candidates)}") + + # We add two columns with targets and span_info. + # They have the same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample. 
+ targets = [] + span_info = [] + for idx, c in enumerate(candidates): + if c[1] == -1: + continue + targets.append(str(idx + 1)) # targets are 1-based + start = c[1] + # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation) + end = min(c[1] + c[2], len(letters)) + span_info.append("CUSTOM " + str(start) + " " + str(end)) + out.write( + " ".join(letters) + + "\t" + + ";".join([x[0] for x in candidates]) + + "\t" + + " ".join(targets) + + "\t" + + ";".join(span_info) + + "\n" + ) +out.close() diff --git a/examples/nlp/spellchecking_asr_customization/run_infer.sh b/examples/nlp/spellchecking_asr_customization/run_infer.sh new file mode 100644 index 000000000000..09da98171c16 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/run_infer.sh @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +## RUN INFERENCE ON NEMO MANIFEST AND CUSTOM VOCABULARY + +## Path to NeMo repository +NEMO_PATH=NeMo + +## Download model repo from Hugging Face (if clone doesn't work, run "git lfs install" and try again) +git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en +## Download repo with test data +git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_evaluation + +## Files in model repo +PRETRAINED_MODEL=spellmapper_asr_customization_en/training_10m_5ep.nemo +NGRAM_MAPPINGS=spellmapper_asr_customization_en/replacement_vocab_filt.txt +BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt + +## Override these two files if you want to test on your own data +## File with input nemo ASR manifest +INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json +## File containing custom words and phrases (plain text) +CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.json + +## Other files will be created +## File with index of custom vocabulary +INDEX="index.txt" +## File with short fragments and corresponding original sentences +SHORT2FULL="short2full.txt" +## File with input for SpellMapper inference +SPELLMAPPER_INPUT="spellmapper_input.txt" +## File with output of SpellMapper inference +SPELLMAPPER_OUTPUT="spellmapper_output.txt" +## File with output nemo ASR manifest +OUTPUT_MANIFEST="out_manifest.json" + + +# Create index of custom vocabulary +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py \ + --input_name ${CUSTOM_VOCAB} \ + --ngram_mappings ${NGRAM_MAPPINGS} \ + --output_name ${INDEX} \ + --min_log_prob -4.0 \ + --max_phrases_per_ngram 600 + +# Prepare input for SpellMapper inference +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \ + --manifest ${INPUT_MANIFEST} \ + --custom_vocab_index ${INDEX} \ + --big_sample ${BIG_SAMPLE} \ + --short2full_name ${SHORT2FULL} \ + --output_name ${SPELLMAPPER_INPUT} \ + --field_name 
"pred_text" \ + --len_in_words 16 \ + --step_in_words 8 + +# Run SpellMapper inference +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ + pretrained_model=${PRETRAINED_MODEL} \ + model.max_sequence_len=512 \ + inference.from_file=${SPELLMAPPER_INPUT} \ + inference.out_file=${SPELLMAPPER_OUTPUT} \ + inference.batch_size=16 \ + lang=en + +# Postprocess and create output corrected manifest +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \ + --input_manifest ${INPUT_MANIFEST} \ + --short2full_name ${SHORT2FULL} \ + --output_manifest ${OUTPUT_MANIFEST} \ + --spellmapper_result ${SPELLMAPPER_OUTPUT} \ + --replace_hyphen_to_space \ + --field_name "pred_text" \ + --use_dp \ + --ngram_mappings ${NGRAM_MAPPINGS} \ + --min_dp_score_per_symbol -1.5 + +# Check WER of initial manifest +python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ + dataset_manifest=${INPUT_MANIFEST} \ + use_cer=False \ + only_score_manifest=True + +# Check WER of corrected manifest +python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ + dataset_manifest=${OUTPUT_MANIFEST} \ + use_cer=False \ + only_score_manifest=True diff --git a/examples/nlp/spellchecking_asr_customization/run_training.sh b/examples/nlp/spellchecking_asr_customization/run_training.sh new file mode 100644 index 000000000000..85dddbb2a038 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/run_training.sh @@ -0,0 +1,56 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## TRAIN WITH NON-TARRED DATA + +# Path to NeMo repository +NEMO_PATH=NeMo + +## Download repo with training data (very small example) +## If clone doesn't work, run "git lfs install" and try again +git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_train_micro + +DATA_PATH=spellmapper_en_train_micro + +## Example of all files needed to run training with non-tarred data: +## spellmapper_en_train_micro +## ├── config.json +##   ├── label_map.txt +##   ├── semiotic_classes.txt +## ├── test.tsv +## └── train.tsv + +## To generate files config.json, label_map.txt, semiotic_classes.txt - run generate_configs.sh +## Files "train.tsv" and "test.tsv" contain training examples. +## For data preparation see https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh + +## Note that training with non-tarred data only works on single gpu. It makes sense if you use 1-2 million examples or less. 
+ +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ + lang="en" \ + data.validation_ds.data_path=${DATA_PATH}/test.tsv \ + data.train_ds.data_path=${DATA_PATH}/train.tsv \ + data.train_ds.batch_size=32 \ + data.train_ds.num_workers=8 \ + model.max_sequence_len=512 \ + model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ + model.language_model.config_file=${DATA_PATH}/config.json \ + model.label_map=${DATA_PATH}/label_map.txt \ + model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ + model.optim.lr=3e-5 \ + trainer.devices=[1] \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.strategy=ddp \ + trainer.max_epochs=5 diff --git a/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh b/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh new file mode 100644 index 000000000000..655c3e23e610 --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh @@ -0,0 +1,63 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## TRAIN WITH TARRED DATA + +# Path to NeMo repository +NEMO_PATH=NeMo + +DATA_PATH=data_folder + +## data_folder_example +## ├── train_tarred +## | ├── part1.tar +## | ├── ... 
+## | └── part200.tar +## ├── config.json +##   ├── label_map.txt +##   ├── semiotic_classes.txt +## └── test.tsv +## To generate files config.json, label_map.txt, semiotic_classes.txt, run generate_configs.sh +## To prepare data, see ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/build_training_data.sh +## To convert data to tarred format, split all.tsv to pieces of 110'000 examples (except for validation part) and use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/convert_data_to_tarred.sh +## To run training with tarred data, use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh + +## ATTENTION: How to calculate model.optim.sched.max_steps: +## Suppose, you have 2'000'000 training examples, and want to train for 5 epochs on 4 gpus with batch size 32. +## 5 (epochs) * 32 (bs) * 4 (gpus) +## 1 step consumes 128 examples (32(bs) * 4(gpus)) +## 1 epoch makes 2000000/128=15625 steps (updates) +## 5 epochs make 5*15625=78125 steps + +python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ + lang="en" \ + data.validation_ds.data_path=${DATA_PATH}/test.tsv \ + data.train_ds.data_path=${DATA_PATH}/train_tarred/part_OP_1..100_CL_.tar \ + data.train_ds.batch_size=32 \ + data.train_ds.num_workers=16 \ + +data.train_ds.use_tarred_dataset=true \ + data.train_ds.shuffle=false \ + data.validation_ds.batch_size=16 \ + model.max_sequence_len=512 \ + model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ + model.language_model.config_file=${DATA_PATH}/config.json \ + model.label_map=${DATA_PATH}/label_map.txt \ + model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ + model.optim.sched.name=CosineAnnealing \ + +model.optim.sched.max_steps=195313 \ + trainer.devices=8 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.strategy=ddp \ + trainer.max_epochs=5 diff --git 
a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py new file mode 100644 index 000000000000..593264f14a5d --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script contains an example on how to run inference with the SpellcheckingAsrCustomizationModel. + +An input line should consist of 4 tab-separated columns: + 1. text of ASR-hypothesis + 2. texts of 10 candidates separated by semicolon + 3. 1-based ids of non-dummy candidates + 4. approximate start/end coordinates of non-dummy candidates (correspond to ids in third column) + +Example input (in one line): + t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x + h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d + 1 2 6 7 8 9 10 + CUSTOM 6 23;CUSTOM 4 10;CUSTOM 4 15;CUSTOM 56 62;CUSTOM 5 19;CUSTOM 28 31;CUSTOM 39 48 + +Each line in SpellMapper output is tab-separated and consists of 4 columns: + 1. ASR-hypothesis (same as in input) + 2. 
10 candidates separated with semicolon (same as in input) + 3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability) + 4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes) + +Example output (in one line): + t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x + h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d + 56 62 7 0.99998;4 20 8 0.95181;12 20 8 0.44829;4 17 8 0.99464;12 17 8 0.97645 + 8 8 8 0 8 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 + + +USAGE Example: +1. Train a model, or use a pretrained checkpoint. +2. Run on a single file: + python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ + pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ + model.max_sequence_len=512 \ + inference.from_file=input.txt \ + inference.out_file=output.txt \ + inference.batch_size=16 \ + lang=en +or on multiple files: + python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ + pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ + model.max_sequence_len=512 \ + +inference.from_filelist=filelist.txt \ + +inference.output_folder=output_folder \ + inference.batch_size=16 \ + lang=en + +This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` +config file by default. The other option is to set another config file via command +line arguments by `--config-name=CONFIG_FILE_PATH'. 
+""" + + +import os + +from helpers import MODEL, instantiate_model_and_trainer +from omegaconf import DictConfig, OmegaConf + +from nemo.core.config import hydra_runner +from nemo.utils import logging + + +@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") +def main(cfg: DictConfig) -> None: + logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') + + if cfg.pretrained_model is None: + raise ValueError("A pre-trained model should be provided.") + _, model = instantiate_model_and_trainer(cfg, MODEL, False) + + if cfg.model.max_sequence_len != model.max_sequence_len: + model.max_sequence_len = cfg.model.max_sequence_len + model.builder._max_seq_length = cfg.model.max_sequence_len + input_filenames = [] + output_filenames = [] + + if "from_filelist" in cfg.inference and "output_folder" in cfg.inference: + filelist_file = cfg.inference.from_filelist + output_folder = cfg.inference.output_folder + with open(filelist_file, "r", encoding="utf-8") as f: + for line in f: + path = line.strip() + input_filenames.append(path) + folder, name = os.path.split(path) + output_filenames.append(os.path.join(output_folder, name)) + else: + text_file = cfg.inference.from_file + logging.info(f"Running inference on {text_file}...") + if not os.path.exists(text_file): + raise ValueError(f"{text_file} not found.") + input_filenames.append(text_file) + output_filenames.append(cfg.inference.out_file) + + dataloader_cfg = { + "batch_size": cfg.inference.get("batch_size", 8), + "num_workers": cfg.inference.get("num_workers", 4), + "pin_memory": cfg.inference.get("pin_memory", False), + } + for input_filename, output_filename in zip(input_filenames, output_filenames): + if not os.path.exists(input_filename): + logging.info(f"Skip non-existing {input_filename}.") + continue + model.infer(dataloader_cfg, input_filename, output_filename) + logging.info(f"Predictions saved to {output_filename}.") + + +if __name__ == "__main__": + main() diff --git
a/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py new file mode 100644 index 000000000000..7ea9314d196d --- /dev/null +++ b/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py @@ -0,0 +1,66 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This script contains an example on how to train SpellMapper (SpellcheckingAsrCustomizationModel). +It uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` +config file by default. The other option is to set another config file via command +line arguments by `--config-name=CONFIG_FILE_PATH`. Probably it is worth looking +at the example config file to see the list of parameters used for training. + +USAGE Example: + See `examples/nlp/spellchecking_asr_customization/run_training.sh` for training on non-tarred data. + and + `examples/nlp/spellchecking_asr_customization/run_training_tarred.sh` for training on tarred data. + +One (non-tarred) training example should consist of 4 tab-separated columns: + 1. text of ASR-hypothesis + 2. texts of 10 candidates separated by semicolon + 3. 1-based ids of correct candidates, or 0 if none + 4. 
start/end coordinates of correct candidates (correspond to ids in third column) +Example (in one line): + a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o + d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y + 1 3 + CUSTOM 12 23;CUSTOM 28 41 +""" + +from helpers import MODEL, instantiate_model_and_trainer +from omegaconf import DictConfig, OmegaConf + +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") +def main(cfg: DictConfig) -> None: + logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') + + # Train the model + if cfg.model.do_training: + logging.info( + "================================================================================================" + ) + logging.info('Start training...') + trainer, model = instantiate_model_and_trainer(cfg, MODEL, True) + spellchecking_exp_manager = cfg.get('exp_manager', None) + exp_manager(trainer, spellchecking_exp_manager) + trainer.fit(model) + logging.info('Training finished!') + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py b/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py index e2ae48a37a0b..f5a53b1f331d 100644 --- a/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py +++ b/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py @@ -19,9 +19,14 @@ import re from argparse import ArgumentParser -from typing import List, Tuple -import numpy as np +from nemo.collections.nlp.data.text_normalization_as_tagging.utils import ( + check_monotonicity, + fill_alignment_matrix, + get_targets, 
+ get_targets_from_back, +) + parser = ArgumentParser(description='Extract final alignments from GIZA++ alignments') parser.add_argument('--mode', type=str, required=True, help='tn or itn') @@ -34,211 +39,13 @@ args = parser.parse_args() -def fill_alignment_matrix( - fline2: str, fline3: str, gline2: str, gline3: str -) -> Tuple[np.ndarray, List[str], List[str]]: - """Parse Giza++ direct and reverse alignment results and represent them as an alignment matrix - - Args: - fline2: e.g. "_2 0 1 4_" - fline3: e.g. "NULL ({ }) twenty ({ 1 }) fourteen ({ 2 3 4 })" - gline2: e.g. "twenty fourteen" - gline3: e.g. "NULL ({ }) _2 ({ 1 }) 0 ({ }) 1 ({ }) 4_ ({ 2 })" - - Returns: - matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment - the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the - words were aligned only in one direction, 0 - no alignment. - srctokens: e.g. ["twenty", "fourteen"] - dsttokens: e.g. 
["_2", "0", "1", "4_"] - - For example, the alignment matrix for the above example may look like: - [[3, 0, 0, 0] - [0, 2, 2, 3]] - """ - if fline2 is None or gline2 is None or fline3 is None or gline3 is None: - raise ValueError(f"empty params") - srctokens = gline2.split() - dsttokens = fline2.split() - pattern = r"([^ ]+) \(\{ ([^\(\{\}\)]*) \}\)" - src2dst = re.findall(pattern, fline3.replace("({ })", "({ })")) - dst2src = re.findall(pattern, gline3.replace("({ })", "({ })")) - if len(src2dst) != len(srctokens) + 1: - raise ValueError( - "length mismatch: len(src2dst)=" - + str(len(src2dst)) - + "; len(srctokens)" - + str(len(srctokens)) - + "\n" - + gline2 - + "\n" - + fline3 - ) - if len(dst2src) != len(dsttokens) + 1: - raise ValueError( - "length mismatch: len(dst2src)=" - + str(len(dst2src)) - + "; len(dsttokens)" - + str(len(dsttokens)) - + "\n" - + fline2 - + "\n" - + gline3 - ) - matrix = np.zeros((len(srctokens), len(dsttokens))) - for i in range(1, len(src2dst)): - token, to_str = src2dst[i] - if to_str == "": - continue - to = list(map(int, to_str.split())) - for t in to: - matrix[i - 1][t - 1] = 2 - - for i in range(1, len(dst2src)): - token, to_str = dst2src[i] - if to_str == "": - continue - to = list(map(int, to_str.split())) - for t in to: - matrix[t - 1][i - 1] += 1 - - return matrix, srctokens, dsttokens - - -def check_monotonicity(matrix: np.ndarray) -> bool: - """Check if alignment is monotonous - i.e. the relative order is preserved (no swaps). - - Args: - matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment - the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the - words were aligned only in one direction, 0 - no alignment. 
- """ - is_sorted = lambda k: np.all(k[:-1] <= k[1:]) - - a = np.argwhere(matrix == 3) - b = np.argwhere(matrix == 2) - c = np.vstack((a, b)) - d = c[c[:, 1].argsort()] # sort by second column (less important) - d = d[d[:, 0].argsort(kind="mergesort")] - return is_sorted(d[:, 1]) - - -def get_targets(matrix: np.ndarray, dsttokens: List[str]) -> List[str]: - """Join some of the destination tokens, so that their number becomes the same as the number of input words. - Unaligned tokens tend to join to the left aligned token. - - Args: - matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment - the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the - words were aligned only in one direction, 0 - no alignment. - dsttokens: e.g. ["_2", "0", "1", "4_"] - Returns: - targets: list of string tokens, with one-to-one correspondence to matrix.shape[0] - - Example: - If we get - matrix=[[3, 0, 0, 0] - [0, 2, 2, 3]] - dsttokens=["_2", "0", "1", "4_"] - it gives - targets = ["_201", "4_"] - Actually, this is a mistake instead of ["_20", "14_"]. That will be further corrected by regular expressions. 
- """ - targets = [] - last_covered_dst_id = -1 - for i in range(len(matrix)): - dstlist = [] - for j in range(last_covered_dst_id + 1, len(dsttokens)): - # matrix[i][j] == 3: safe alignment point - if matrix[i][j] == 3 or ( - j == last_covered_dst_id + 1 - and np.all(matrix[i, :] == 0) # if the whole line does not have safe points - and np.all(matrix[:, j] == 0) # and the whole column does not have safe points, match them - ): - if len(targets) == 0: # if this is first safe point, attach left unaligned columns to it, if any - for k in range(0, j): - if np.all(matrix[:, k] == 0): # if column k does not have safe points - dstlist.append(dsttokens[k]) - else: - break - dstlist.append(dsttokens[j]) - last_covered_dst_id = j - for k in range(j + 1, len(dsttokens)): - if np.all(matrix[:, k] == 0): # if column k does not have safe points - dstlist.append(dsttokens[k]) - last_covered_dst_id = k - else: - break - - if len(dstlist) > 0: - if args.mode == "tn": - targets.append("_".join(dstlist)) - else: - targets.append("".join(dstlist)) - else: - targets.append("") - return targets - - -def get_targets_from_back(matrix: np.ndarray, dsttokens: List[str]) -> List[str]: - """Join some of the destination tokens, so that their number becomes the same as the number of input words. - Unaligned tokens tend to join to the right aligned token. - - Args: - matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment - the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the - words were aligned only in one direction, 0 - no alignment. - dsttokens: e.g. ["_2", "0", "1", "4_"] - Returns: - targets: list of string tokens, with one-to-one correspondence to matrix.shape[0] - - Example: - If we get - matrix=[[3, 0, 0, 0] - [0, 2, 2, 3]] - dsttokens=["_2", "0", "1", "4_"] - it gives - targets = ["_2", "014_"] - Actually, this is a mistake instead of ["_20", "14_"]. 
That will be further corrected by regular expressions. - """ - - targets = [] - last_covered_dst_id = len(dsttokens) - for i in range(len(matrix) - 1, -1, -1): - dstlist = [] - for j in range(last_covered_dst_id - 1, -1, -1): - if matrix[i][j] == 3 or ( - j == last_covered_dst_id - 1 and np.all(matrix[i, :] == 0) and np.all(matrix[:, j] == 0) - ): - if len(targets) == 0: - for k in range(len(dsttokens) - 1, j, -1): - if np.all(matrix[:, k] == 0): - dstlist.append(dsttokens[k]) - else: - break - dstlist.append(dsttokens[j]) - last_covered_dst_id = j - for k in range(j - 1, -1, -1): - if np.all(matrix[:, k] == 0): - dstlist.append(dsttokens[k]) - last_covered_dst_id = k - else: - break - if len(dstlist) > 0: - if args.mode == "tn": - targets.append("_".join(list(reversed(dstlist)))) - else: - targets.append("".join(list(reversed(dstlist)))) - else: - targets.append("") - return list(reversed(targets)) - - def main() -> None: g = open(args.giza_dir + "/GIZA++." + args.giza_suffix, "r", encoding="utf-8") f = open(args.giza_dir + "/GIZA++reverse." 
+ args.giza_suffix, "r", encoding="utf-8") + target_inner_delimiter = "" if args.mode == "tn": g, f = f, g + target_inner_delimiter = "_" out = open(args.giza_dir + "/" + args.out_filename, "w", encoding="utf-8") cache = {} good_count, not_mono_count, not_covered_count, exception_count = 0, 0, 0, 0 @@ -277,8 +84,8 @@ def main() -> None: else: matrix[matrix <= 2] = 0 # leave only 1-to-1 alignment points if check_monotonicity(matrix): - targets = get_targets(matrix, dsttokens) - targets_from_back = get_targets_from_back(matrix, dsttokens) + targets = get_targets(matrix, dsttokens, delimiter=target_inner_delimiter) + targets_from_back = get_targets_from_back(matrix, dsttokens, delimiter=target_inner_delimiter) if len(targets) != len(srctokens): raise ValueError( "targets length doesn't match srctokens length: len(targets)=" diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py b/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py new file mode 100644 index 000000000000..4e786276108c --- /dev/null +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from nemo.collections.nlp.data.spellchecking_asr_customization.dataset import ( + SpellcheckingAsrCustomizationDataset, + SpellcheckingAsrCustomizationTestDataset, + TarredSpellcheckingAsrCustomizationDataset, +) diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py new file mode 100644 index 000000000000..803d0eaf8aed --- /dev/null +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -0,0 +1,593 @@ +# Copyright 2019 The Google Research Authors. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from collections import OrderedDict +from os import path +from typing import Dict, List, Optional, Tuple, Union + +from transformers import PreTrainedTokenizerBase + +"""Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. +""" + + +class BertExample(object): + """Class for training and inference examples for BERT. + + Attributes: + features: Feature dictionary. 
+ """ + + def __init__( + self, + input_ids: List[int], + input_mask: List[int], + segment_ids: List[int], + input_ids_for_subwords: List[int], + input_mask_for_subwords: List[int], + segment_ids_for_subwords: List[int], + character_pos_to_subword_pos: List[int], + fragment_indices: List[Tuple[int, int, int]], + labels_mask: List[int], + labels: List[int], + spans: List[Tuple[int, int, int]], + default_label: int, + ) -> None: + """Inputs to the example wrapper + + Args: + input_ids: indices of single characters (treated as subwords) + input_mask: list of bools with 0s in place of input_ids to be masked + segment_ids: list of ints from 0 to 10 to denote the text segment type ( + 0 - for tokens of ASR hypothesis, + 1 - for tokens of the first candidate + ... + 10 - for tokens of the tenth candidate + ) + input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) + input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked + segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set + labels_mask: bool tensor with 0s in place of label tokens to be masked + labels: indices of semiotic classes which should be predicted from each of the + corresponding input tokens + spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) + default_label: The default label + """ + input_len = len(input_ids) + if not ( + input_len == len(input_mask) + and input_len == len(segment_ids) + and input_len == len(labels_mask) + and input_len == len(labels) + and input_len == len(character_pos_to_subword_pos) + ): + raise ValueError("All feature lists should have the same length ({})".format(input_len)) + + 
input_len_for_subwords = len(input_ids_for_subwords) + if not ( + input_len_for_subwords == len(input_mask_for_subwords) + and input_len_for_subwords == len(segment_ids_for_subwords) + ): + raise ValueError( + "All feature lists for subwords should have the same length ({})".format(input_len_for_subwords) + ) + + self.features = OrderedDict( + [ + ("input_ids", input_ids), + ("input_mask", input_mask), + ("segment_ids", segment_ids), + ("input_ids_for_subwords", input_ids_for_subwords), + ("input_mask_for_subwords", input_mask_for_subwords), + ("segment_ids_for_subwords", segment_ids_for_subwords), + ("character_pos_to_subword_pos", character_pos_to_subword_pos), + ("fragment_indices", fragment_indices), + ("labels_mask", labels_mask), + ("labels", labels), + ("spans", spans), + ] + ) + self._default_label = default_label + + +class BertExampleBuilder(object): + """Builder class for BertExample objects.""" + + def __init__( + self, + label_map: Dict[str, int], + semiotic_classes: Dict[str, int], + tokenizer: PreTrainedTokenizerBase, + max_seq_length: int, + ) -> None: + """Initializes an instance of BertExampleBuilder. + + Args: + label_map: Mapping from tags to tag IDs. + semiotic_classes: Mapping from semiotic classes to their ids. + tokenizer: Tokenizer object. + max_seq_length: Maximum sequence length. + """ + self._label_map = label_map + self._semiotic_classes = semiotic_classes + self._tokenizer = tokenizer + self._max_seq_length = max_seq_length + # one span usually covers one or more words and it only exists for custom phrases, so there are much less spans than characters. + self._max_spans_length = max(4, int(max_seq_length / 20)) + self._pad_id = self._tokenizer.pad_token_id + self._default_label = 0 + + def build_bert_example( + self, hyp: str, ref: str, target: Optional[str] = None, span_info: Optional[str] = None, infer: bool = False + ) -> Optional[BertExample]: + """Constructs a BERT Example. + + Args: + hyp: Hypothesis text. 
+ ref: Candidate customization variants divided by ';' + target: + if infer==False, string of labels (each label is 1-based index of correct candidate) or 0. + if infer==True, it can be None or string of labels (each label is 1-based index of some candidate). In inference this can be used to get corresponding fragments to fragment_indices. + span_info: + string of format "CUSTOM 6 20;CUSTOM 40 51", number of parts corresponds to number of targets. Can be empty if target is 0. + If infer==False, numbers are correct start and end(exclusive) positions of the corresponding target candidate in the text. + If infer==True, numbers are EXPECTED positions in the text. In inference this can be used to get corresponding fragments to fragment_indices. + infer: inference mode + Returns: + BertExample, or None if the conversion from text to tags was infeasible + + Example (infer=False): + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" + target: "1 3" + span_info: "CUSTOM 12 23;CUSTOM 28 41" + """ + if not ref.count(";") == 9: + raise ValueError("Expect 10 candidates: " + ref) + + span_info_parts = [] + targets = [] + + if len(target) > 0 and target != "0": + span_info_parts = span_info.split(";") + targets = list(map(int, target.split(" "))) + if len(span_info_parts) != len(targets): + raise ValueError( + "len(span_info_parts)=" + + str(len(span_info_parts)) + + " is different from len(target_parts)=" + + str(len(targets)) + ) + + tags = [0 for _ in hyp.split()] + if not infer: + for p, t in zip(span_info_parts, targets): + c, start, end = p.split(" ") + start = int(start) + end = int(end) + tags[start:end] = [t for i in range(end - start)] + + # get input features for characters + (input_ids, input_mask, segment_ids, labels_mask, labels, _, 
_,) = self._get_input_features( + hyp=hyp, ref=ref, tags=tags + ) + + # get input features for words + hyp_with_words = hyp.replace(" ", "").replace("_", " ") + ref_with_words = ref.replace(" ", "").replace("_", " ") + ( + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + _, + _, + _, + _, + ) = self._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) + + # used in forward to concatenate subword embeddings to character embeddings + character_pos_to_subword_pos = self._map_characters_to_subwords(input_ids, input_ids_for_subwords) + + fragment_indices = [] + if infer: + # used in inference to take argmax over whole fragments instead of separate characters to get more consistent predictions + fragment_indices = self._get_fragment_indices(hyp, targets, span_info_parts) + + spans = [] + if not infer: + # during training spans are used in validation step to calculate accuracy on whole custom phrases instead of separate characters + spans = self._get_spans(span_info_parts) + + if len(input_ids) > self._max_seq_length or len(spans) > self._max_spans_length: + print( + "Max len exceeded: len(input_ids)=", + len(input_ids), + "; _max_seq_length=", + self._max_seq_length, + "; len(spans)=", + len(spans), + "; _max_spans_length=", + self._max_spans_length, + ) + return None + + example = BertExample( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + input_ids_for_subwords=input_ids_for_subwords, + input_mask_for_subwords=input_mask_for_subwords, + segment_ids_for_subwords=segment_ids_for_subwords, + character_pos_to_subword_pos=character_pos_to_subword_pos, + fragment_indices=fragment_indices, + labels_mask=labels_mask, + labels=labels, + spans=spans, + default_label=self._default_label, + ) + return example + + def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: + """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of 
starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """ + result_spans = [] + + for p in span_info_parts: + if p == "": + break + c, start, end = p.split(" ") + if c not in self._semiotic_classes: + raise KeyError("class=" + c + " not found in self._semiotic_classes") + cid = self._semiotic_classes[c] + # +1 because this should be indexing on input_ids which has [CLS] token at beginning + start = int(start) + 1 + end = int(end) + 1 + result_spans.append((cid, start, end)) + return result_spans + + def _get_fragment_indices( + self, hyp: str, targets: List[int], span_info_parts: List[str] + ) -> Tuple[List[Tuple[int, int, int]]]: + """ Build fragment indices for real candidates. + This is used only at inference. + After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). + In this function we + 1) adjust start/end positions to match word borders (possibly in multiple ways). + 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). + + Args: + hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). + targets: list of candidate ids (only for real candidates, not dummy) + span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. + Returns: + List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. + Note that returned fragments can be unsorted and can overlap, it's ok. 
+ Example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + targets: [1 2 3 4 6 7 9] + span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this function. + fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] + """ + + fragment_indices = [] + + letters = hyp.split() + + for target, p in zip(targets, span_info_parts): + _, start, end = p.split(" ") + start = int(start) + end = min(int(end), len(hyp)) # guarantee that end is not outside length + + # Adjusting strategy 1: expand both sides to the nearest space. + # Adjust start by finding the nearest left space or beginning of text. If start is already some word beginning, it won't change. + k = start + while k > 0 and letters[k] != '_': + k -= 1 + adjusted_start = k if k == 0 else k + 1 + + # Adjust end by finding the nearest right space. If end is already space or sentence end, it won't change. + k = end + while k < len(letters) and letters[k] != '_': + k += 1 + adjusted_end = k + + # +1 because this should be indexing on input_ids which has [CLS] token at beginning + fragment_indices.append((adjusted_start + 1, adjusted_end + 1, target)) + + # Adjusting strategy 2: try to shrink to the closest space (from left or right or both sides).
+ # For example, here the candidate "shippers" has a matching n-gram covering part of previous word + # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w + # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + expanded_fragment = "".join(letters[adjusted_start:adjusted_end]) + left_space_position = expanded_fragment.find("_") + right_space_position = expanded_fragment.rfind("_") + is_left_shrink = False + is_right_shrink = False + if left_space_position > -1 and left_space_position < len(expanded_fragment) / 2: + # +1 because of CLS token, another +1 to put start position after found space + fragment_indices.append((adjusted_start + 1 + left_space_position + 1, adjusted_end + 1, target)) + is_left_shrink = True + if right_space_position > -1 and right_space_position > len(expanded_fragment) / 2: + fragment_indices.append((adjusted_start + 1, adjusted_start + 1 + right_space_position, target)) + is_right_shrink = True + if is_left_shrink and is_right_shrink: + fragment_indices.append( + (adjusted_start + 1 + left_space_position + 1, adjusted_start + 1 + right_space_position, target) + ) + + return fragment_indices + + def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: + """ Maps each single character to the position of its corresponding subword. + + Args: + input_ids: List of character token ids. + input_ids_for_subwords: List of subword token ids. + Returns: + List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... 
, 45, 46, 46, 46, 46, 46, 47] + """ + character_pos_to_subword_pos = [0 for _ in input_ids] + + ## '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' + tokens = self._tokenizer.convert_ids_to_tokens(input_ids) + ## '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] + tokens_for_subwords = self._tokenizer.convert_ids_to_tokens(input_ids_for_subwords) + j = 0 # index for tokens_for_subwords + j_offset = 0 # current letter index within subword + for i in range(len(tokens)): + character = tokens[i] + subword = tokens_for_subwords[j] + if character == "[CLS]" and subword == "[CLS]": + character_pos_to_subword_pos[i] = j + j += 1 + continue + if character == "[SEP]" and subword == "[SEP]": + character_pos_to_subword_pos[i] = j + j += 1 + continue + if character == "[CLS]" or character == "[SEP]" or subword == "[CLS]" or subword == "[SEP]": + raise IndexError( + "character[" + + str(i) + + "]=" + + character + + "; subword[" + + str(j) + + ";=" + + subword + + "subwords=" + + str(tokens_for_subwords) + ) + # At this point we expect that + # subword either 1) is a normal first token of a word or 2) starts with "##" (not first word token) + # character either 1) is a normal character or 2) is a space character "_" + if character == "_": + character_pos_to_subword_pos[i] = j - 1 # space is assigned to previous subtoken + continue + if j_offset < len(subword): + if character == subword[j_offset]: + character_pos_to_subword_pos[i] = j + j_offset += 1 + else: + raise IndexError( + "character mismatch:" + + "i=" + + str(i) + + "j=" + + str(j) + + "j_offset=" + + str(j_offset) + + "; len(tokens)=" + + str(len(tokens)) + + "; len(subwords)=" + 
+ str(len(tokens_for_subwords)) + ) + # if subword is finished, increase j + if j_offset >= len(subword): + j += 1 + j_offset = 0 + if j >= len(tokens_for_subwords): + break + if tokens_for_subwords[j].startswith("##"): + j_offset = 2 + # check that all subword tokens are processed + if j < len(tokens_for_subwords): + raise IndexError( + "j=" + + str(j) + + "; len(tokens)=" + + str(len(tokens)) + + "; len(subwords)=" + + str(len(tokens_for_subwords)) + ) + return character_pos_to_subword_pos + + def _get_input_features( + self, hyp: str, ref: str, tags: List[int] + ) -> Tuple[List[int], List[int], List[int], List[int], List[int], List[str], List[int]]: + """Converts given ASR-hypothesis(hyp) and candidate string(ref) to features(token ids, mask, segment ids, etc). + + Args: + hyp: Hypothesis text. + ref: Candidate customization variants divided by ';' + tags: List of labels corresponding to each token of ASR-hypothesis or None when building an example during inference. + Returns: + Features (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) + + Note that this method is called both for character-based example and for word-based example (to split to subwords). 
+ + Character-based example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + ref: "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" + tags: "0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3" + + resulting token sequence: + '[CLS]', 'a', 's', 't', 'r', 'o', 'n', 'o', 'm', 'e', 'r', 's', '_', 'd', 'i', ..., 'l', 'o', '[SEP]', 'd', 'i', 'd', 'i', 'e', 'r', '_', 's', 'a', 'u', 'm', 'o', 'n', ..., '[SEP]' + + Word-based example: + hyp: "astronomers didie somon and tristian gllo" + ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" + tags: None (not used for word-based case) + + resulting token sequence: + '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] + """ + + labels_mask = [] + labels = [] + if tags is None: + hyp_tokens, token_start_indices = self._split_to_wordpieces(hyp.split()) + else: + hyp_tokens, labels, token_start_indices = self._split_to_wordpieces_with_labels(hyp.split(), tags) + references = ref.split(";") + all_ref_tokens = [] + all_ref_segment_ids = [] + for i in range(len(references)): + ref_tokens, _ = self._split_to_wordpieces(references[i].split()) + all_ref_tokens.extend(ref_tokens + ["[SEP]"]) + all_ref_segment_ids.extend([i + 1] * (len(ref_tokens) + 1)) + + input_tokens = ["[CLS]"] + hyp_tokens + ["[SEP]"] + all_ref_tokens # ends with [SEP] + input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens) + input_mask = [1] * len(input_ids) + segment_ids = [0] + [0] * len(hyp_tokens) + [0] + all_ref_segment_ids + if len(input_ids) != len(segment_ids): + raise 
ValueError( + "len(input_ids)=" + + str(len(input_ids)) + + " is different from len(segment_ids)=" + + str(len(segment_ids)) + ) + + if tags: + labels_mask = [0] + [1] * len(labels) + [0] + [0] * len(all_ref_tokens) + labels = [0] + labels + [0] + [0] * len(all_ref_tokens) + return (input_ids, input_mask, segment_ids, labels_mask, labels, hyp_tokens, token_start_indices) + + def _split_to_wordpieces_with_labels( + self, tokens: List[str], labels: List[int] + ) -> Tuple[List[str], List[int], List[int]]: + """Splits tokens (and the labels accordingly) to WordPieces. + + Args: + tokens: Tokens to be split. + labels: Labels (one per token) to be split. + + Returns: + 3-tuple with the split tokens, split labels, and the indices of starting tokens of words + """ + bert_tokens = [] # Original tokens split into wordpieces. + bert_labels = [] # Label for each wordpiece. + # Index of each wordpiece that starts a new token. + token_start_indices = [] + for i, token in enumerate(tokens): + # '+ 1' is because bert_tokens will be prepended by [CLS] token later. + token_start_indices.append(len(bert_tokens) + 1) + pieces = self._tokenizer.tokenize(token) + bert_tokens.extend(pieces) + bert_labels.extend([labels[i]] * len(pieces)) + return bert_tokens, bert_labels, token_start_indices + + def _split_to_wordpieces(self, tokens: List[str]) -> Tuple[List[str], List[int]]: + """Splits tokens to WordPieces. + + Args: + tokens: Tokens to be split. + + Returns: + tuple with the split tokens, and the indices of the WordPieces that start a token. + """ + bert_tokens = [] # Original tokens split into wordpieces. + # Index of each wordpiece that starts a new token. + token_start_indices = [] + for i, token in enumerate(tokens): + # '+ 1' is because bert_tokens will be prepended by [CLS] token later. 
+ token_start_indices.append(len(bert_tokens) + 1) + pieces = self._tokenizer.tokenize(token) + bert_tokens.extend(pieces) + return bert_tokens, token_start_indices + + def read_input_file( + self, input_filename: str, infer: bool = False + ) -> Union[List['BertExample'], Tuple[List['BertExample'], Tuple[str, str]]]: + """Reads in Tab Separated Value file and converts to training/inference-ready examples. + + Args: + example_builder: Instance of BertExampleBuilder + input_filename: Path to the TSV input file. + infer: If true, input examples do not contain target info. + + Returns: + examples: List of converted examples (BertExample). + or + (examples, hyps_refs): If infer==True, additionally returns hyps_refs, a list of (hyp, ref) tuples aligned with examples. + """ + + if not path.exists(input_filename): + raise ValueError("Cannot find file: " + input_filename) + examples = [] # output list of BertExample + hyps_refs = [] # output list of tuples (ASR-hypothesis, candidate_str) + with open(input_filename, 'r') as f: + for line in f: + if len(examples) % 1000 == 0: + logging.info("{} examples processed.".format(len(examples))) + if infer: + parts = line.rstrip('\n').split('\t') + hyp, ref, target, span_info = parts[0], parts[1], None, None + if len(parts) == 4: + target, span_info = parts[2], parts[3] + try: + example = self.build_bert_example(hyp, ref, target=target, span_info=span_info, infer=infer) + except Exception as e: + logging.warning(str(e)) + logging.warning(line) + continue + if example is None: + logging.info("cannot create example: ") + logging.info(line) + continue + hyps_refs.append((hyp, ref)) + examples.append(example) + else: + hyp, ref, target, semiotic_info = line.rstrip('\n').split('\t') + try: + example = self.build_bert_example( + hyp, ref, target=target, span_info=semiotic_info, infer=infer + ) + except Exception as e: + logging.warning(str(e)) + logging.warning(line) + continue + if example is None: + logging.info("cannot create example: ") + logging.info(line) + continue + examples.append(example) +
logging.info(f"Done. {len(examples)} examples converted.") + if infer: + return examples, hyps_refs + return examples diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py b/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py new file mode 100644 index 000000000000..69705ec21b9d --- /dev/null +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/dataset.py @@ -0,0 +1,521 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pickle +from io import BytesIO +from typing import Dict, List, Optional, Tuple + +import braceexpand +import numpy as np +import torch +import webdataset as wd + +from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder +from nemo.core.classes.dataset import Dataset, IterableDataset +from nemo.core.neural_types import ChannelType, IntType, LabelsType, MaskType, NeuralType +from nemo.utils import logging + +__all__ = [ + "SpellcheckingAsrCustomizationDataset", + "SpellcheckingAsrCustomizationTestDataset", + "TarredSpellcheckingAsrCustomizationDataset", +] + + +def collate_train_dataset( + batch: List[ + Tuple[ + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + ] + ], + pad_token_id: int, +) -> Tuple[ + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, + torch.LongTensor, +]: + """collate batch of training items + Args: + batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans). 
+ pad_token_id: integer id of padding token (to use in padded_input_ids, padded_input_ids_for_subwords) + """ + max_length = 0 + max_length_for_subwords = 0 + max_length_for_spans = 1 # to avoid empty tensor + for ( + input_ids, + input_mask, + segment_ids, + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + character_pos_to_subword_pos, + labels_mask, + labels, + spans, + ) in batch: + if len(input_ids) > max_length: + max_length = len(input_ids) + if len(input_ids_for_subwords) > max_length_for_subwords: + max_length_for_subwords = len(input_ids_for_subwords) + if len(spans) > max_length_for_spans: + max_length_for_spans = len(spans) + + padded_input_ids = [] + padded_input_mask = [] + padded_segment_ids = [] + padded_input_ids_for_subwords = [] + padded_input_mask_for_subwords = [] + padded_segment_ids_for_subwords = [] + padded_character_pos_to_subword_pos = [] + padded_labels_mask = [] + padded_labels = [] + padded_spans = [] + for ( + input_ids, + input_mask, + segment_ids, + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + character_pos_to_subword_pos, + labels_mask, + labels, + spans, + ) in batch: + if len(input_ids) < max_length: + pad_length = max_length - len(input_ids) + padded_input_ids.append(np.pad(input_ids, pad_width=[0, pad_length], constant_values=pad_token_id)) + padded_input_mask.append(np.pad(input_mask, pad_width=[0, pad_length], constant_values=0)) + padded_segment_ids.append(np.pad(segment_ids, pad_width=[0, pad_length], constant_values=0)) + padded_labels_mask.append(np.pad(labels_mask, pad_width=[0, pad_length], constant_values=0)) + padded_labels.append(np.pad(labels, pad_width=[0, pad_length], constant_values=0)) + padded_character_pos_to_subword_pos.append( + np.pad(character_pos_to_subword_pos, pad_width=[0, pad_length], constant_values=0) + ) + else: + padded_input_ids.append(input_ids) + padded_input_mask.append(input_mask) + padded_segment_ids.append(segment_ids) + 
def collate_test_dataset(
    batch: List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]],
    pad_token_id: int,
) -> Tuple[
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
    torch.LongTensor,
]:
    """Right-pad every item of an inference batch to common lengths and stack the fields.

    Args:
        batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords,
            input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, fragment_indices).
        pad_token_id: integer id of padding token (used for input_ids and input_ids_for_subwords;
            every other 1-D field is padded with 0).

    Returns:
        Tuple of eight LongTensors, one per field, in the same order as the fields of a batch item.
    """
    # Target lengths for the three groups of fields.
    max_length = max((len(item[0]) for item in batch), default=0)
    max_length_for_subwords = max((len(item[3]) for item in batch), default=0)
    # at least one row is always kept, to avoid an empty tensor
    max_length_for_fragment_indices = max([1] + [len(item[7]) for item in batch])

    columns = [[] for _ in range(8)]
    for ids, mask, segments, sub_ids, sub_mask, sub_segments, char_to_subword, fragments in batch:
        # Character-level fields all share the padding amount derived from input_ids.
        missing = max_length - len(ids)
        if missing > 0:
            ids = np.pad(ids, pad_width=[0, missing], constant_values=pad_token_id)
            mask = np.pad(mask, pad_width=[0, missing], constant_values=0)
            segments = np.pad(segments, pad_width=[0, missing], constant_values=0)
            char_to_subword = np.pad(char_to_subword, pad_width=[0, missing], constant_values=0)

        # Subword-level fields share the padding amount derived from input_ids_for_subwords.
        missing = max_length_for_subwords - len(sub_ids)
        if missing > 0:
            sub_ids = np.pad(sub_ids, pad_width=[0, missing], constant_values=pad_token_id)
            sub_mask = np.pad(sub_mask, pad_width=[0, missing], constant_values=0)
            sub_segments = np.pad(sub_segments, pad_width=[0, missing], constant_values=0)

        if len(fragments) < max_length_for_fragment_indices:
            # we use [0, 1, 0] as padding value for fragment_indices, it corresponds to [CLS] token,
            # which is ignored and won't affect anything
            filler = np.zeros((max_length_for_fragment_indices, 3), dtype=int)
            filler[:, 1] = 1
            if len(fragments) > 0:
                # copy actual fragment_indices to the beginning
                filler[: fragments.shape[0], : fragments.shape[1]] = fragments
            fragments = filler

        row = (ids, mask, segments, sub_ids, sub_mask, sub_segments, char_to_subword, fragments)
        for column, value in zip(columns, row):
            column.append(value)

    return tuple(torch.LongTensor(np.array(column)) for column in columns)
class SpellcheckingAsrCustomizationDataset(Dataset):
    """
    Map-style dataset used by the SpellcheckingAsrCustomizationModel in training and validation pipelines.

    Args:
        input_file (str): path to tsv-file with data
        example_builder: instance of BertExampleBuilder
    """

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """
        sequence_axes = ('B', 'T')
        # (port name, element type) for every port laid out as batch x time
        sequence_ports = (
            ("input_ids", ChannelType),
            ("input_mask", MaskType),
            ("segment_ids", ChannelType),
            ("input_ids_for_subwords", ChannelType),
            ("input_mask_for_subwords", MaskType),
            ("segment_ids_for_subwords", ChannelType),
            ("character_pos_to_subword_pos", ChannelType),
            ("labels_mask", MaskType),
            ("labels", LabelsType),
        )
        ports = {name: NeuralType(sequence_axes, element_type()) for name, element_type in sequence_ports}
        # spans carries an extra trailing axis
        ports["spans"] = NeuralType(('B', 'T', 'C'), IntType())
        return ports

    def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None:
        self.example_builder = example_builder
        # infer=False: the builder returns only the list of examples
        self.examples = self.example_builder.read_input_file(input_file, infer=False)
        self.pad_token_id = self.example_builder._pad_id

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx: int):
        """Return the features of example `idx` as a tuple of numpy arrays, in the order expected by `_collate_fn`."""
        features = self.examples[idx].features
        # (feature name, storage dtype); compact dtypes are used here, the collate function converts to LongTensor
        layout = (
            ("input_ids", np.int16),
            ("input_mask", np.int8),
            ("segment_ids", np.int8),
            ("input_ids_for_subwords", np.int16),
            ("input_mask_for_subwords", np.int8),
            ("segment_ids_for_subwords", np.int8),
            ("character_pos_to_subword_pos", np.int16),
            ("labels_mask", np.int8),
            ("labels", np.int8),
            ("spans", np.int16),
        )
        return tuple(np.array(features[name], dtype=dtype) for name, dtype in layout)

    def _collate_fn(self, batch):
        """collate batch of items
        Args:
            batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans).
        """
        return collate_train_dataset(batch, pad_token_id=self.pad_token_id)
class TarredSpellcheckingAsrCustomizationDataset(IterableDataset):
    """
    This Dataset loads training examples from tarred tokenized pickle files.
    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed
    (the remainder shards are not used).
    Shard strategy is scatter - each node gets a unique set of shards, which are permanently
    pre-allocated and never changed at runtime.
    Note that this class does not define `__len__`.

    Args:
        text_tar_filepaths: a string (can be brace-expandable).
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 1. Shuffling is skipped when the value is not positive.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
        pad_token_id: id of pad token (used in collate_fn). The default of -1 is a placeholder
            that is rejected, so a real non-negative id must always be passed.

    Raises:
        ValueError: if `pad_token_id` is negative.
    """

    def __init__(
        self,
        text_tar_filepaths: str,
        shuffle_n: int = 1,
        global_rank: int = 0,
        world_size: int = 1,
        pad_token_id: int = -1,  # use real value or get error
    ):
        super().__init__()
        if pad_token_id < 0:
            raise ValueError("use non-negative pad_token_id: " + str(pad_token_id))

        self.pad_token_id = pad_token_id

        # Replace '(', '[', '<' and '_OP_' with '{'
        for bkey in ('(', '[', '<', '_OP_'):
            text_tar_filepaths = text_tar_filepaths.replace(bkey, "{")

        # Replace ')', ']', '>' and '_CL_' with '}'
        for bkey in (')', ']', '>', '_CL_'):
            text_tar_filepaths = text_tar_filepaths.replace(bkey, "}")

        # Brace expand
        text_tar_filepaths = list(braceexpand.braceexpand(text_tar_filepaths))

        logging.info("Tarred dataset shards will be scattered evenly across all nodes.")
        if len(text_tar_filepaths) % world_size != 0:
            logging.warning(
                f"Number of shards in tarred dataset ({len(text_tar_filepaths)}) is not divisible "
                f"by number of distributed workers ({world_size}). "
                f"Some shards will not be used ({len(text_tar_filepaths) % world_size})."
            )
        # Every rank takes a contiguous slice of equal size; remainder shards at the end are dropped.
        shards_per_rank = len(text_tar_filepaths) // world_size
        begin_idx = shards_per_rank * global_rank
        end_idx = begin_idx + shards_per_rank
        logging.info('Begin Index : %d' % (begin_idx))
        logging.info('End Index : %d' % (end_idx))
        text_tar_filepaths = text_tar_filepaths[begin_idx:end_idx]
        logging.info(
            "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx
        )

        self.tarpath = text_tar_filepaths

        # Put together WebDataset
        self._dataset = wd.WebDataset(urls=text_tar_filepaths, nodesplitter=None)

        if shuffle_n > 0:
            self._dataset = self._dataset.shuffle(shuffle_n, initial=shuffle_n)
        else:
            logging.info("WebDataset will not shuffle files within the tar files.")

        self._dataset = self._dataset.rename(pkl='pkl', key='__key__').to_tuple('pkl', 'key').map(f=self._build_sample)

    def _build_sample(self, fname):
        """Unpickle one tar member and return its features as a tuple.

        Args:
            fname: pair (pickled bytes, sample key) as produced by the WebDataset pipeline; the key is ignored.

        Returns:
            Tuple (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords,
            segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans).
        """
        pkl_bytes, _ = fname
        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the payload.
        # Only point this dataset at tar shards produced by a trusted pipeline.
        with BytesIO(pkl_bytes) as pkl_file:
            data = pickle.load(pkl_file)

        return (
            data["input_ids"],
            data["input_mask"],
            data["segment_ids"],
            data["input_ids_for_subwords"],
            data["input_mask_for_subwords"],
            data["segment_ids_for_subwords"],
            data["character_pos_to_subword_pos"],
            data["labels_mask"],
            data["labels"],
            data["spans"],
        )

    def __iter__(self):
        return self._dataset.__iter__()

    def _collate_fn(self, batch):
        """collate batch of items
        Args:
            batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, labels_mask, labels, spans).
        """
        return collate_train_dataset(batch, pad_token_id=self.pad_token_id)
class SpellcheckingAsrCustomizationTestDataset(Dataset):
    """
    Dataset for inference pipeline.

    Args:
        input_file (str): path to the input file, passed to `example_builder.read_input_file(..., infer=True)`
        example_builder: instance of BertExampleBuilder
    """

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """
        return {
            "input_ids": NeuralType(('B', 'T'), ChannelType()),
            "input_mask": NeuralType(('B', 'T'), MaskType()),
            "segment_ids": NeuralType(('B', 'T'), ChannelType()),
            "input_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()),
            "input_mask_for_subwords": NeuralType(('B', 'T'), MaskType()),
            "segment_ids_for_subwords": NeuralType(('B', 'T'), ChannelType()),
            "character_pos_to_subword_pos": NeuralType(('B', 'T'), ChannelType()),
            "fragment_indices": NeuralType(('B', 'T', 'C'), IntType()),
        }

    def __init__(self, input_file: str, example_builder: BertExampleBuilder) -> None:
        self.example_builder = example_builder
        # In inference mode the builder returns a second value besides the examples; it is kept as `hyps_refs`.
        self.examples, self.hyps_refs = self.example_builder.read_input_file(input_file, infer=True)
        self.pad_token_id = self.example_builder._pad_id

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx: int):
        """Return the features of example `idx` as a tuple of numpy arrays, in the order expected by `_collate_fn`."""
        features = self.examples[idx].features
        input_ids = np.array(features["input_ids"])
        input_mask = np.array(features["input_mask"])
        segment_ids = np.array(features["segment_ids"])
        input_ids_for_subwords = np.array(features["input_ids_for_subwords"])
        input_mask_for_subwords = np.array(features["input_mask_for_subwords"])
        segment_ids_for_subwords = np.array(features["segment_ids_for_subwords"])
        character_pos_to_subword_pos = np.array(features["character_pos_to_subword_pos"], dtype=np.int64)
        fragment_indices = np.array(features["fragment_indices"], dtype=np.int16)
        return (
            input_ids,
            input_mask,
            segment_ids,
            input_ids_for_subwords,
            input_mask_for_subwords,
            segment_ids_for_subwords,
            character_pos_to_subword_pos,
            fragment_indices,
        )

    def _collate_fn(self, batch):
        """collate batch of items
        Args:
            batch: A list of tuples of (input_ids, input_mask, segment_ids, input_ids_for_subwords, input_mask_for_subwords, segment_ids_for_subwords, character_pos_to_subword_pos, fragment_indices).
        """
        return collate_test_dataset(batch, pad_token_id=self.pad_token_id)
def replace_diacritics(text):
    """Replace accented letters with their plain counterparts.

    Latin letters with diacritics are mapped to the base ASCII letter, the ligatures "æ" and "œ"
    to "ae" and "oe", and cyrillic "ё" to cyrillic "е". All other characters are kept as is.

    Args:
        text: input string
    Returns:
        string with the listed characters replaced
    """
    groups = (
        ("éèëēêęěė", "e"),  # latin
        ("ё", "е"),  # cyrillic
        ("ãâāáäăàąåạảǎ", "a"),
        ("úūüùưûů", "u"),
        ("ôōóöõòőø", "o"),
        ("ćçč", "c"),
        ("ïīíîıì", "i"),
        ("ñńňņ", "n"),
        ("țťţ", "t"),
        ("łľļ", "l"),
        ("żžź", "z"),
        ("ğ", "g"),
        ("ďđ", "d"),
        ("ķ", "k"),
        ("ř", "r"),
        ("ý", "y"),
        ("æ", "ae"),
        ("œ", "oe"),
        ("șşšś", "s"),
    )
    # One translation pass instead of one regex pass per group. If a code point ever appeared
    # in several groups, the first group wins, as it would with sequential substitutions.
    table = {}
    for accented, plain in groups:
        for char in accented:
            table.setdefault(ord(char), plain)
    return text.translate(table)


def load_ngram_mappings(input_name: str, max_misspelled_freq: int = 1000000000) -> Tuple[defaultdict, Set]:
    """Loads n-gram mapping vocabulary in the nested form used for building the index (see get_index)
    Args:
        input_name: file with n-gram mappings
        max_misspelled_freq: threshold on misspelled n-gram frequency; n-grams above it are banned
    Returns:
        vocab: dict {key=original_ngram, value=dict{key=misspelled_ngram, value=joint_freq / orig_freq}}
        ban_ngram: set of banned misspelled n-grams (each stored with a trailing space)
    Raises:
        ValueError: if the original or misspelled n-gram field is empty, or a line does not have 5 fields

    Input format: original \\t misspelled \\t joint_freq \\t original_freq \\t misspelled_freq
        u t o       u+i t o     49      8145    114
        u t o       t e         63      8145    16970
        u t o       o+_ t o     42      8145    1807
    """
    # NOTE(review): the search string below was empty in the reviewed text, which makes str.replace
    # insert "=" between every character. "<DELETE>" is assumed to be the deletion placeholder
    # written by the n-gram mapping generator - confirm against that tool.
    delete_marker = "<DELETE>"

    vocab = defaultdict(dict)
    ban_ngram = set()

    with open(input_name, "r", encoding="utf-8") as f:
        for line in f:
            orig, misspelled, joint_freq, orig_freq, misspelled_freq = line.strip().split("\t")
            if orig == "" or misspelled == "":
                raise ValueError("Empty n-gram: orig=" + orig + "; misspelled=" + misspelled)
            misspelled = misspelled.replace(delete_marker, "=")
            if misspelled.replace("=", "").strip() == "":  # skip if resulting ngram doesn't contain any real character
                continue
            if int(misspelled_freq) > max_misspelled_freq:
                ban_ngram.add(misspelled + " ")  # space at the end is required within get_index function
            # the n-gram stays in vocab even when banned; banned n-grams are filtered by the consumer
            vocab[orig][misspelled] = int(joint_freq) / int(orig_freq)
    return vocab, ban_ngram


def load_ngram_mappings_for_dp(input_name: str) -> Tuple[defaultdict, defaultdict, defaultdict, int]:
    """Loads n-gram mapping vocabularies in form required by dynamic programming
    Args:
        input_name: file with n-gram mappings
    Returns:
        joint_vocab: dict where key=(original_ngram, misspelled_ngram), value=frequency
        orig_vocab: dict where key=original_ngram, value=frequency
        misspelled_vocab: dict where key=misspelled_ngram, value=frequency
        max_len: maximum n-gram length seen in vocabulary
    Raises:
        ValueError: if the original or misspelled n-gram field is empty, or a line does not have 5 fields

    All three vocabularies accumulate the joint frequency column; the per-n-gram frequency columns
    of the file are ignored. Deletion placeholders and "+" joints are removed from the misspelled
    n-gram, so several input lines can collapse onto the same key.

    Input format: original \\t misspelled \\t joint_freq \\t original_freq \\t misspelled_freq
        u t o       u+i t o     49      8145    114
        u t o       t e         63      8145    16970
        u t o       o+_ t o     42      8145    1807
    """
    # NOTE(review): the search string below was empty in the reviewed text, so the placeholder was
    # never removed. "<DELETE>" is assumed to be the deletion placeholder - confirm against the generator.
    delete_marker = "<DELETE>"

    joint_vocab = defaultdict(int)
    orig_vocab = defaultdict(int)
    misspelled_vocab = defaultdict(int)
    max_len = 0
    with open(input_name, "r", encoding="utf-8") as f:
        for line in f:
            orig, misspelled, joint_freq, _, _ = line.strip().split("\t")
            if orig == "" or misspelled == "":
                raise ValueError("Empty n-gram: orig=" + orig + "; misspelled=" + misspelled)
            misspelled = misspelled.replace(delete_marker, " ").replace("+", " ")
            misspelled = " ".join(misspelled.split())
            if misspelled == "":  # skip if resulting ngram doesn't contain any real character
                continue
            max_len = max(max_len, orig.count(" ") + 1, misspelled.count(" ") + 1)
            joint_vocab[(orig, misspelled)] += int(joint_freq)
            orig_vocab[orig] += int(joint_freq)
            misspelled_vocab[misspelled] += int(joint_freq)
    return joint_vocab, orig_vocab, misspelled_vocab, max_len
def get_alignment_by_dp(
    ref_phrase: str, hyp_phrase: str, dp_data: Tuple[defaultdict, defaultdict, defaultdict, int]
) -> List[Tuple[str, str, float, float, int, int, int]]:
    """Get best alignment path between a reference and (possibly) misspelled phrase using n-gram mappings vocabulary.

    A table with one cell per (hypothesis position, reference position) is filled row by row.
    A cell can be reached by an n-gram mapping found in the vocabulary (scored with
    log(joint_freq / orig_freq)), or by skipping a single hypothesis letter or a single
    reference letter at a fixed penalty of -6.0. The best-scoring path is then traced back
    from the last cell.

    Args:
        ref_phrase: candidate reference phrase (letters separated by space, real space replaced by underscore)
        hyp_phrase: (possibly) misspelled phrase (letters separated by space, real space replaced by underscore)
        dp_data: n-gram mapping vocabularies used by dynamic programming:
            (joint_vocab, orig_vocab, misspelled_vocab, max_len)
    Returns:
        list of tuples (hyp_ngram, ref_ngram, logprob, sum_logprob, joint_freq, orig_freq, misspelled_freq)
        This is best alignment path. The first element always aligns the two "*" start sentinels.
        For a step that skips a letter, the n-gram of the other side is an empty string.
    Raises:
        ValueError: if no move at all could be chosen for a cell.

    Example:
        ref_phrase: "a n h y d r i d e"
        hyp_phrase: "a n d _ h y d r o d"

        Result:
        [("*", "*", 0.0, 0.0, 0, 0, 0),
         ("a n d _ h", "a n h", -2.34, -2.34, 226, 2338, 2203),
         ("y d r o", "y d r i", -2.95, -5.29, 11, 211, 1584),
         ("d", "d e", -1.99, -7.28, 60610, 444714, 2450334),
        ]
    Final path score is in path[-1][3]: -7.28
    Note that the order of ref_phrase and hyp_phrase matters, because n-gram mappings vocabulary is not symmetrical.
    """
    joint_vocab, orig_vocab, misspelled_vocab, max_len = dp_data
    # "*" is a start sentinel, so that position 0 on both sides means "nothing consumed yet"
    hyp_letters = ["*"] + hyp_phrase.split()
    ref_letters = ["*"] + ref_phrase.split()
    DpInfo = namedtuple(
        "DpInfo", ["hyp_pos", "ref_pos", "best_hyp_ngram_len", "best_ref_ngram_len", "score", "sum_score"]
    )
    # NOTE(review): DpInfo has no field defaults, so this default factory cannot build a value;
    # indexing a missing cell would raise instead of creating one.
    history = defaultdict(DpInfo)
    history[(0, 0)] = DpInfo(
        hyp_pos=0, ref_pos=0, best_hyp_ngram_len=1, best_ref_ngram_len=1, score=0.0, sum_score=0.0
    )
    for hyp_pos in range(len(hyp_letters)):
        for ref_pos in range(len(ref_letters)):
            if hyp_pos == 0 and ref_pos == 0:  # cell (0, 0) is already defined
                continue
            # consider cell (hyp_pos, ref_pos) and find best path to get there
            best_hyp_ngram_len = 0
            best_ref_ngram_len = 0
            best_ngram_score = float("-inf")
            best_sum_score = float("-inf")
            # loop over paths ending on non-empty ngram mapping
            # (both n-grams end at the current positions; their lengths are capped by max_len)
            for hyp_ngram_len in range(1, 1 + min(max_len, hyp_pos + 1)):
                hyp_ngram = " ".join(hyp_letters[(hyp_pos - hyp_ngram_len + 1) : (hyp_pos + 1)])
                for ref_ngram_len in range(1, 1 + min(max_len, ref_pos + 1)):
                    ref_ngram = " ".join(ref_letters[(ref_pos - ref_ngram_len + 1) : (ref_pos + 1)])
                    if (ref_ngram, hyp_ngram) not in joint_vocab:
                        continue
                    joint_freq = joint_vocab[(ref_ngram, hyp_ngram)]
                    orig_freq = orig_vocab.get(ref_ngram, 1)
                    ngram_score = math.log(joint_freq / orig_freq)
                    previous_cell = (hyp_pos - hyp_ngram_len, ref_pos - ref_ngram_len)
                    # an n-gram that includes the "*" sentinel points before the table (index -1)
                    if previous_cell not in history:
                        print("cell ", previous_cell, "does not exist")
                        continue
                    previous_score = history[previous_cell].sum_score
                    sum_score = ngram_score + previous_score
                    # strict comparison: on ties the first candidate found is kept
                    if sum_score > best_sum_score:
                        best_sum_score = sum_score
                        best_ngram_score = ngram_score
                        best_hyp_ngram_len = hyp_ngram_len
                        best_ref_ngram_len = ref_ngram_len
            # two more variants: skip one hypothesis letter ("deletion") or one reference letter ("insertion"),
            # both at a fixed penalty
            deletion_score = -6.0
            insertion_score = -6.0
            if hyp_pos > 0:
                previous_cell = (hyp_pos - 1, ref_pos)
                previous_score = history[previous_cell].sum_score
                sum_score = deletion_score + previous_score
                if sum_score > best_sum_score:
                    best_sum_score = sum_score
                    best_ngram_score = deletion_score
                    best_hyp_ngram_len = 1
                    best_ref_ngram_len = 0

            if ref_pos > 0:
                previous_cell = (hyp_pos, ref_pos - 1)
                previous_score = history[previous_cell].sum_score
                sum_score = insertion_score + previous_score
                if sum_score > best_sum_score:
                    best_sum_score = sum_score
                    best_ngram_score = insertion_score
                    best_hyp_ngram_len = 0
                    best_ref_ngram_len = 1

            if best_hyp_ngram_len == 0 and best_ref_ngram_len == 0:
                raise ValueError("best_hyp_ngram_len = 0 and best_ref_ngram_len = 0")

            # save cell to history
            history[(hyp_pos, ref_pos)] = DpInfo(
                hyp_pos=hyp_pos,
                ref_pos=ref_pos,
                best_hyp_ngram_len=best_hyp_ngram_len,
                best_ref_ngram_len=best_ref_ngram_len,
                score=best_ngram_score,
                sum_score=best_sum_score,
            )
    # now trace back on best path starting from last positions
    path = []
    hyp_pos = len(hyp_letters) - 1
    ref_pos = len(ref_letters) - 1
    cell_info = history[(hyp_pos, ref_pos)]
    path.append(cell_info)
    while hyp_pos > 0 or ref_pos > 0:
        # step back by the lengths of the n-grams that led into the current cell
        hyp_pos -= cell_info.best_hyp_ngram_len
        ref_pos -= cell_info.best_ref_ngram_len
        cell_info = history[(hyp_pos, ref_pos)]
        path.append(cell_info)

    # convert the path (collected end-to-start) into n-gram pairs with their scores and frequencies
    result = []
    for info in reversed(path):
        hyp_ngram = " ".join(hyp_letters[(info.hyp_pos - info.best_hyp_ngram_len + 1) : (info.hyp_pos + 1)])
        ref_ngram = " ".join(ref_letters[(info.ref_pos - info.best_ref_ngram_len + 1) : (info.ref_pos + 1)])
        joint_freq = joint_vocab.get((ref_ngram, hyp_ngram), 0)
        orig_freq = orig_vocab.get(ref_ngram, 0)
        misspelled_freq = misspelled_vocab.get(hyp_ngram, 0)
        result.append((hyp_ngram, ref_ngram, info.score, info.sum_score, joint_freq, orig_freq, misspelled_freq))
    return result
def get_index(
    custom_phrases: List[str],
    vocab: defaultdict,
    ban_ngram_global: Set[str],
    min_log_prob: float = -4.0,
    max_phrases_per_ngram: int = 100,
) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]:
    """Given a restricted vocabulary of replacements,
    loops through custom phrases,
    generates all possible conversions and creates index.

    Args:
        custom_phrases: list of all custom phrases, characters should be split by space, real space replaced to underscore.
        vocab: n-gram mappings vocabulary - dict {key=original_ngram, value=dict{key=misspelled_ngram, value=weight}}.
            The weight is passed to math.log, i.e. it is treated as a probability
            (load_ngram_mappings stores joint_freq / orig_freq there).
        ban_ngram_global: set of banned misspelled n-grams (each is expected to end with a space)
        min_log_prob: minimum log probability, after which we stop growing this n-gram.
        max_phrases_per_ngram: maximum phrases that we allow to store per one n-gram. N-grams exceeding that quantity get banned.

    Returns:
        phrases - list of phrases. Position in this list is used as phrase_id.
        ngram2phrases - resulting index, i.e. dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob)
    """

    ban_ngram_local = set()  # these ngrams are banned only for given custom_phrases
    ngram_to_phrase_and_position = defaultdict(list)

    for custom_phrase in custom_phrases:
        inputs = custom_phrase.split(" ")
        begin = 0
        # index_keys[b]: misspelled n-grams that start at phrase position b;
        # key - n-gram string (ends with a space), value - accumulated log probability
        index_keys = [{} for _ in inputs]

        for begin in range(len(inputs)):
            # source n-grams of 1 to 4 letters starting at `begin`
            for end in range(begin + 1, min(len(inputs) + 1, begin + 5)):
                inp = " ".join(inputs[begin:end])
                if inp not in vocab:
                    continue
                for rep in vocab[inp]:
                    lp = math.log(vocab[inp][rep])

                    for b in range(max(0, end - 5), end):  # try to grow previous ngrams with new replacement
                        new_ngrams = {}
                        for ngram in index_keys[b]:
                            lp_prev = index_keys[b][ngram]
                            # grow only n-grams that end exactly where the new replacement begins
                            # (presumably every space-terminated item stands for one phrase position - TODO confirm)
                            # and keep the resulting string within the length limit
                            if len(ngram) + len(rep) <= 10 and b + ngram.count(" ") == begin:
                                if lp_prev + lp > min_log_prob:
                                    new_ngrams[ngram + rep + " "] = lp_prev + lp
                        index_keys[b].update(new_ngrams)  # join two dictionaries
                    # add current replacement as ngram
                    if lp > min_log_prob:
                        index_keys[begin][rep + " "] = lp

        for b in range(len(index_keys)):
            # most probable n-grams first
            for ngram, lp in sorted(index_keys[b].items(), key=lambda item: item[1], reverse=True):
                if ngram in ban_ngram_global:  # here ngram ends with a space
                    continue
                # number of phrase positions covered, counted before "+" and "=" are removed
                real_length = ngram.count(" ")
                ngram = ngram.replace("+", " ").replace("=", " ")
                ngram = " ".join(ngram.split())  # here ngram doesn't end with a space anymore
                if ngram + " " in ban_ngram_global:  # this can happen after deletion of + and =
                    continue
                if ngram in ban_ngram_local:
                    continue
                ngram_to_phrase_and_position[ngram].append((custom_phrase, b, real_length, lp))
                # too many phrases share this n-gram: drop it and ignore it from now on
                if len(ngram_to_phrase_and_position[ngram]) > max_phrases_per_ngram:
                    ban_ngram_local.add(ngram)
                    del ngram_to_phrase_and_position[ngram]
                    continue

    phrases = []  # id to phrase
    phrase2id = {}  # phrase to id
    ngram2phrases = defaultdict(list)  # ngram to list of tuples (phrase_id, begin, length, logprob)

    # phrase ids are assigned in the order in which phrases are first met in the surviving n-grams
    for ngram in ngram_to_phrase_and_position:
        for phrase, b, length, lp in ngram_to_phrase_and_position[ngram]:
            if phrase not in phrase2id:
                phrases.append(phrase)
                phrase2id[phrase] = len(phrases) - 1
            ngram2phrases[ngram].append((phrase2id[phrase], b, length, lp))

    return phrases, ngram2phrases
def load_index(input_name: str) -> Tuple[List[str], Dict[str, List[Tuple[int, int, int, float]]]]:
    """ Read an n-gram index from a tab-separated file
    Args:
        input_name: file with index; every line is ngram \\t phrase \\t begin_pos \\t size \\t logprob
    Returns:
        phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id.
        ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob)
    """
    known_phrases = {}  # phrase -> phrase_id, in order of first appearance
    index = defaultdict(list)  # ngram -> list of tuples (phrase_id, begin_pos, size, logprob)
    with open(input_name, "r", encoding="utf-8") as index_file:
        for record in index_file:
            ngram, phrase, begin_pos, size, logprob = record.split("\t")
            begin_pos, size, logprob = int(begin_pos), int(size), float(logprob)
            # a new phrase gets the next free id
            phrase_id = known_phrases.setdefault(phrase, len(known_phrases))
            index[ngram].append((phrase_id, begin_pos, size, logprob))
    return list(known_phrases), index
def search_in_index(
    ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]], phrases: List[str], letters: Union[str, List[str]]
) -> Tuple[np.ndarray, List[Set[str]]]:
    """ Look up all letter n-grams of an ASR-hypothesis in the index

    Args:
        ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob)
        phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id.
        letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores.

    Returns:
        phrases2positions: a matrix of size (len(phrases), len(letters)).
            It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.
            It is used later to find phrases with many hits within a contiguous window - potential matching candidates.
        position2ngrams: positions in ASR-hypothesis mapped to sets of ngrams starting from that position.
            It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered).

    Raises:
        ValueError: if `letters` contains a space.
    """

    if " " in letters:
        raise ValueError("letters should not contain space: " + str(letters))

    n_letters = len(letters)
    hit_matrix = np.zeros((len(phrases), n_letters), dtype=float)
    # positions mapped to sets of ngrams starting from that position
    ngrams_by_start = [set() for _ in range(n_letters)]

    for start, found_here in enumerate(ngrams_by_start):
        # windows of 1 to 6 letters beginning at `start`
        last_stop = min(n_letters, start + 6)
        for stop in range(start + 1, last_stop + 1):
            key = " ".join(letters[start:stop])
            if key not in ngram2phrases:
                continue
            for entry in ngram2phrases[key]:
                hit_matrix[entry[0], start:stop] = 1.0
            found_here.add(key)
    return hit_matrix, ngrams_by_start
@jit(nopython=True)  # Set "nopython" mode for best performance, equivalent to @njit
def get_all_candidates_coverage(phrases, phrases2positions):
    """Get maximum hit coverage for each phrase - within a moving window of length of the phrase.
    Args:
        phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id.
        phrases2positions: a matrix of size (len(phrases), len(ASR-hypothesis)).
            It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.
    Returns:
        candidate2coverage: list of size len(phrases) containing coverage in best window.
            The value is the number of hits in the window divided by (phrase length + 2),
            so it stays below 1.0; phrases that were skipped keep 0.0.
        candidate2position: list of size len(phrases) containing starting position of best window;
            phrases that were skipped keep -1.
    """
    candidate2coverage = [0.0] * len(phrases)
    candidate2position = [-1] * len(phrases)

    for i in range(len(phrases)):
        # phrase letters are separated by single spaces, so length = number of spaces + 1
        phrase_length = phrases[i].count(" ") + 1
        all_coverage = np.sum(phrases2positions[i]) / phrase_length
        # if total coverage on whole ASR-hypothesis is too small, there is no sense in using moving window
        if all_coverage < 0.4:
            continue
        # hits in the first window; the window is then shifted one position at a time
        moving_sum = np.sum(phrases2positions[i, 0:phrase_length])
        max_sum = moving_sum
        best_pos = 0
        for pos in range(1, phrases2positions.shape[1] - phrase_length + 1):
            moving_sum -= phrases2positions[i, pos - 1]
            moving_sum += phrases2positions[i, pos + phrase_length - 1]
            # strict comparison: the leftmost of equally good windows is kept
            if moving_sum > max_sum:
                max_sum = moving_sum
                best_pos = pos

        coverage = max_sum / (phrase_length + 2)  # smoothing
        candidate2coverage[i] = coverage
        candidate2position[i] = best_pos
    return candidate2coverage, candidate2position
def get_candidates(
    ngram2phrases: Dict[str, List[Tuple[int, int, int, float]]],
    phrases: List[str],
    letters: Union[str, List[str]],
    pool_for_random_candidates: List[str],
    min_phrase_coverage: float = 0.8,
) -> List[Tuple[str, int, int, float, float]]:
    """Given an index of custom vocabulary and an ASR-hypothesis retrieve 10 candidates.

    Real candidates are taken from the phrases with the best window coverage. If fewer than 10
    are found, the list is filled up with random dummy candidates, and the final list is shuffled.

    Args:
        ngram2phrases: dict where key=ngram, value=list of tuples (phrase_id, begin_pos, size, logprob)
        phrases: List of all phrases in custom vocabulary. Position corresponds to phrase_id.
        letters: list of letters of ASR-hypothesis. Should not contain spaces - real spaces should be replaced with underscores.
        pool_for_random_candidates: large list of strings, from which to sample random candidates in case when there are less than 10 real candidates
        min_phrase_coverage: We discard candidates which are not covered by n-grams to at least to this extent
            (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered).
    Returns:
        candidates: list of tuples (candidate_text, approximate_begin_position, length, coverage of window in ASR-hypothesis, coverage of phrase itself).
            Either exactly 10 candidates in random order, or an empty list if there is no real candidate.
            Dummy candidates have begin position -1 and both coverages equal to 0.0.
    """
    phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, letters)
    candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)

    # mask for each custom phrase: which of its symbols are covered by input ngrams
    phrases2coveredsymbols = [[0] * len(phrase.split(" ")) for phrase in phrases]
    candidates = []
    k = 0
    for idx, coverage in sorted(enumerate(candidate2coverage), key=lambda item: item[1], reverse=True):
        begin = candidate2position[idx]  # this is most likely beginning of this candidate
        phrase_length = phrases[idx].count(" ") + 1
        # NOTE(review): phrases skipped by get_all_candidates_coverage have begin == -1, so the first
        # iteration below reads position2ngrams[-1] (the last position) - confirm this is intended.
        for pos in range(begin, begin + phrase_length):
            # we do not know exact end of custom phrase in text, it can be different from phrase length
            if pos >= len(position2ngrams):
                break
            for ngram in position2ngrams[pos]:
                for phrase_id, b, size, lp in ngram2phrases[ngram]:
                    if phrase_id != idx:
                        continue
                    for ppos in range(b, b + size):
                        if ppos >= phrase_length:
                            break
                        phrases2coveredsymbols[phrase_id][ppos] = 1
        # only the 100 best-covered phrases are examined
        k += 1
        if k > 100:
            break
        real_coverage = sum(phrases2coveredsymbols[idx]) / len(phrases2coveredsymbols[idx])
        if real_coverage < min_phrase_coverage:
            continue
        candidates.append((phrases[idx], begin, phrase_length, coverage, real_coverage))

    # no need to process this sentence further if it does not contain any real candidates
    if len(candidates) == 0:
        print("WARNING: no real candidates", candidates)
        return []

    # fill up with random dummy candidates, converted to the same "letters separated by space" form
    while len(candidates) < 10:
        dummy = random.choice(pool_for_random_candidates)
        dummy = " ".join(list(dummy.replace(" ", "_")))
        candidates.append((dummy, -1, dummy.count(" ") + 1, 0.0, 0.0))

    # at this point there are at least 10 candidates, so the slice always yields exactly 10
    candidates = candidates[:10]
    random.shuffle(candidates)
    return candidates
def read_spellmapper_predictions(filename: str) -> List[Tuple[str, List[Tuple[int, int, str, float]], List[int]]]:
    """Parse a file with results of SpellMapper inference.

    Every line holds four tab-separated fields: the sentence, 10 candidates separated by ";",
    fragment predictions separated by ";" (each is "begin end candidate_id prob", may be empty),
    and space-separated letter predictions. In the sentence and the candidates letters are
    separated by spaces and real spaces are written as underscores.

    Args:
        filename: file with SpellMapper results
    Returns:
        list of tuples (sent, list of fragment predictions, list of letter predictions)
        One fragment prediction is a tuple (begin, end, replacement_text, prob)
    Raises:
        IndexError: if a line does not have exactly 10 candidates or the number of letter
            predictions differs from the sentence length.
    """

    def to_plain_text(spaced: str) -> str:
        # "n e w _ y o r k" -> "new york"
        return spaced.replace(" ", "").replace("_", " ")

    parsed = []
    with open(filename, "r", encoding="utf-8") as predictions_file:
        for record in predictions_file:
            sentence, candidate_field, fragment_field, letter_field = record.strip().split("\t")
            text = to_plain_text(sentence)
            candidates = to_plain_text(candidate_field).split(";")
            letter_predictions = [int(item) for item in letter_field.split()]
            if len(candidates) != 10:
                raise IndexError("expect 10 candidates, got: ", len(candidates))
            if len(text) != len(letter_predictions):
                raise IndexError("len(text)=", len(text), "; len(letter_predictions)=", len(letter_predictions))
            fragments = []
            if fragment_field != "":
                for chunk in fragment_field.split(";"):
                    begin, end, candidate_id, prob = chunk.split(" ")
                    begin, end, candidate_id = int(begin), int(end), int(candidate_id)
                    prob = float(prob)
                    # candidate ids in the file are 1-based
                    fragments.append((begin, end, candidates[candidate_id - 1], prob))
            fragments.sort()  # it will sort by begin, then by end
            parsed.append((text, fragments, letter_predictions))
    return parsed
def substitute_replacements_in_text(
    text: str, replacements: List[Tuple[int, int, str, float]], replace_hyphen_to_space: bool
) -> str:
    """Substitute replacements to the input text, iterating from end to beginning, so that indexing does not change.
    Note that we expect intersecting replacements to be already filtered; if one is still met, it is skipped
    with a warning.

    Args:
        text: sentence;
        replacements: list of replacements, each is a tuple (begin, end, text, probability); the list itself is
            not modified;
        replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces;
    Returns:
        corrected sentence
    """
    corrected_text = text
    last_begin = len(text) + 1
    # sorted() works on a copy, so the caller's list keeps its order.
    # Going right-to-left keeps the offsets of the not yet applied replacements valid.
    for begin, end, candidate, _ in sorted(replacements, reverse=True):
        if end > last_begin:
            print("WARNING: skip intersecting replacement [", candidate, "] in text: ", text)
            continue
        if replace_hyphen_to_space:
            candidate = candidate.replace("-", " ")
        corrected_text = corrected_text[:begin] + candidate + corrected_text[end:]
        last_begin = begin
    return corrected_text
def apply_replacements_to_text(
    text: str,
    replacements: List[Tuple[int, int, str, float]],
    min_prob: float = 0.5,
    replace_hyphen_to_space: bool = False,
    dp_data: Tuple[defaultdict, defaultdict, defaultdict, int] = None,
    min_dp_score_per_symbol: float = -99.9,
) -> str:
    """Filter and apply replacements to the input sentence.

    Args:
        text: input sentence;
        replacements: list of proposed replacements (probably intersecting), each is a tuple
            (begin, end, text, probability); the list itself is not modified;
        min_prob: threshold on replacement probability;
        replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces;
        dp_data: n-gram mapping vocabularies used by dynamic programming, if None - dynamic programming is not used;
        min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length
    Returns:
        corrected sentence
    """
    # Note that we do not skip replacements with same text,
    # otherwise intersecting candidates with lower probability can win
    filtered_replacements = []
    # iterate over a copy sorted by positions, the caller's list is left untouched
    for begin, end, candidate, prob in sorted(replacements):
        fragment = text[begin:end]
        # apply penalty if candidate length is bigger than fragment length
        # to avoid cases like "forward-looking" replacing "looking" in "forward looking"
        # resulting in "forward forward looking"
        if len(candidate) > len(fragment):
            prob *= len(fragment) / len(candidate)
        # skip replacement with low probability
        if prob < min_prob:
            continue
        # skip replacements with some predefined templates, e.g. "*'s" => "*s"
        if check_banned_replacements(fragment, candidate):
            continue
        if dp_data is not None:
            if not fragment:
                # an empty fragment has no per-symbol score (this used to raise ZeroDivisionError)
                continue
            candidate_spaced = " ".join(list(candidate.replace(" ", "_")))
            fragment_spaced = " ".join(list(fragment.replace(" ", "_")))
            path = get_alignment_by_dp(candidate_spaced, fragment_spaced, dp_data)
            # path[-1][3] is the sum of logprobs for best path of dynamic programming: divide sum_score by length
            if path[-1][3] / len(fragment) < min_dp_score_per_symbol:
                continue

        # skip replacement if it intersects with previous replacement and has lower probability,
        # otherwise remove previous replacement
        if filtered_replacements and filtered_replacements[-1][1] > begin:
            if filtered_replacements[-1][3] > prob:
                continue
            filtered_replacements.pop()
        filtered_replacements.append((begin, end, candidate, prob))

    return substitute_replacements_in_text(text, filtered_replacements, replace_hyphen_to_space)
def update_manifest_with_spellmapper_corrections(
    input_manifest_name: str,
    short2full_name: str,
    output_manifest_name: str,
    spellmapper_results_name: str,
    min_prob: float = 0.5,
    replace_hyphen_to_space: bool = True,
    field_name: str = "pred_text",
    use_dp: bool = True,
    ngram_mappings: Union[str, None] = None,
    min_dp_score_per_symbol: float = -1.5,
) -> None:
    """Post-process SpellMapper predictions and write corrected sentence to the specified field of nemo manifest.
    The previous content of this field will be copied to "*_before_correction" field.
    If the sentence was split into fragments before running SpellMapper, all replacements will be first gathered
    together and then applied to the original long sentence.

    Args:
        input_manifest_name: input nemo manifest;
        short2full_name: text file with two columns: short_sent \\t full_sent;
        output_manifest_name: output nemo manifest;
        spellmapper_results_name: text file with SpellMapper inference results;
        min_prob: threshold on replacement probability;
        replace_hyphen_to_space: if True, hyphens in replacements will be converted to spaces;
        field_name: name of json field whose text we want to correct;
        use_dp: bool = If True, additional replacement filtering will be applied using dynamic programming (works slow);
        ngram_mappings: file with n-gram mappings, only needed if use_dp=True
        min_dp_score_per_symbol: threshold on dynamic programming sum score averaged by hypothesis length
    """
    short2full_sent = defaultdict(list)
    sent2corrections = {}  # full sentence -> list of (begin, end, candidate, prob) in full-sentence coordinates
    with open(short2full_name, "r", encoding="utf-8") as f:
        for line in f:
            short_sent, full_sent = line.strip().split("\t")
            short2full_sent[short_sent].append(full_sent)
            sent2corrections[full_sent] = []

    spellmapper_results = read_spellmapper_predictions(spellmapper_results_name)
    dp_data = load_ngram_mappings_for_dp(ngram_mappings) if use_dp else None

    for short_sent, replacements, _ in spellmapper_results:
        # .get() does not insert missing keys into the defaultdict; unknown fragments are simply skipped.
        # It can happen that one short sentence occurred in multiple full sentences.
        for full_sent in short2full_sent.get(short_sent, ()):
            # shift fragment-relative positions to positions inside the full sentence
            offset = full_sent.find(short_sent)
            sent2corrections[full_sent].extend(
                (begin + offset, end + offset, candidate, prob) for begin, end, candidate, prob in replacements
            )

    # The input is opened first, so a missing input manifest does not truncate an existing output file,
    # and both handles are closed even if a record fails to parse.
    with open(input_manifest_name, "r", encoding="utf-8") as f, open(
        output_manifest_name, "w", encoding="utf-8"
    ) as out:
        for line in f:
            record = json.loads(line.strip())
            sent = record[field_name]
            record[field_name + "_before_correction"] = sent
            if sent in sent2corrections:
                record[field_name] = apply_replacements_to_text(
                    sent,
                    sent2corrections[sent],
                    min_prob=min_prob,
                    replace_hyphen_to_space=replace_hyphen_to_space,
                    dp_data=dp_data,
                    min_dp_score_per_symbol=min_dp_score_per_symbol,
                )
            out.write(json.dumps(record) + "\n")
def extract_and_split_text_from_manifest(
    input_name: str, output_name: str, field_name: str = "pred_text", len_in_words: int = 16, step_in_words: int = 8
) -> None:
    """Extract text of the specified field in nemo manifest and split it into fragments (possibly with intersection).
    The result is saved to a text file with two columns: short_sent \\t full_sent.
    This is useful if we want to process shorter sentences and then apply the results to the original long sentence.
    Duplicate (short_sent, full_sent) pairs are written once, in order of first occurrence.

    Args:
        input_name: input nemo manifest,
        output_name: output text file,
        field_name: name of json field from which we extract the sentence text,
        len_in_words: maximum number of words in a fragment,
        step_in_words: on how many words we move at each step.
            For example, if the len_in_words=16 and step_in_words=8 the fragments will be intersected by half.
    Raises:
        ValueError: if a sentence contains two consecutive spaces.
    """
    # A dict is used as an insertion-ordered set: the output is de-duplicated and, unlike with a plain
    # set, identical between runs.
    short2full_sent = {}
    with open(input_name, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line.strip())
            sent = record[field_name]
            if "  " in sent:  # two consecutive spaces
                raise ValueError("found multiple space in: " + sent)
            words = sent.split()
            for i in range(0, len(words), step_in_words):
                short_sent = " ".join(words[i : i + len_in_words])
                short2full_sent[(short_sent, sent)] = None

    with open(output_name, "w", encoding="utf-8") as out:
        out.writelines(short_sent + "\t" + full_sent + "\n" for short_sent, full_sent in short2full_sent)
# Pairs of endings (first_suffix, second_suffix): two phrases that share the same stem and differ only by
# these endings must not replace one another. Every pair is applied in both directions.
_BANNED_SUFFIX_PAIRS = (
    ("s'", "'s"),  # customers' <=> customer's
    ("s", "'s"),  # customers <=> customer's
    ("s", "s'"),  # customers <=> customers'
    ("ies", "y's"),  # utilities <=> utility's
    ("ies", "y"),  # utilities <=> utility
    (" is", "'s"),  # group is <=> group's
    ("'s", ""),  # trex's <=> trex
    ("e", "ed"),  # anticipate <=> anticipated
    ("ed", ""),  # regarded <=> regard
    ("er", ""),  # longer <=> long
    ("ed", "ing"),  # discussed <=> discussing
    ("ion", "ing"),  # discussion <=> discussing
    ("ers", "ing"),  # dispensers <=> dispensing
    ("ion", "ed"),  # discussion <=> discussed
    ("ntal", "nt"),  # incremental <=> increment
    ("ery", "erer"),  # delivery <=> deliverer
    ("bly", "ble"),  # comparably <=> comparable
    ("l", "lly"),  # beautiful <=> beautifully
    ("a", "an"),  # america <=> american
)

# A phrase must not be replaced by the same phrase with one of these prefixes removed/added:
# reinvesting <=> investing, outperformance <=> performance
_BANNED_PREFIXES = ("re", "out")


def _differ_only_by_suffix(first: str, second: str, first_suffix: str, second_suffix: str) -> bool:
    """Return True if `first` and `second` end with the given suffixes and are equal once these are cut off."""
    return (
        first.endswith(first_suffix)
        and second.endswith(second_suffix)
        and first[: len(first) - len(first_suffix)] == second[: len(second) - len(second_suffix)]
    )


def check_banned_replacements(src: str, dst: str) -> bool:
    """This function is used to check if a pair of words/phrases is matching some common template
    that we don't want to replace with one another (mostly inflections of the same word).
    The check is symmetric: the result for (src, dst) and (dst, src) is the same.

    Args:
        src: first phrase
        dst: second phrase
    Returns True if this replacement should be banned.
    """
    for first, second in ((src, dst), (dst, src)):
        if any(_differ_only_by_suffix(first, second, fs, ss) for fs, ss in _BANNED_SUFFIX_PAIRS):
            return True
        # increases <=> increase (but trimass <=> trimas is ok)
        if first.endswith("s") and not first.endswith("ss") and first[:-1] == second:
            return True
        if any(first.startswith(prefix) and first[len(prefix) :] == second for prefix in _BANNED_PREFIXES):
            return True
    return False
def fill_alignment_matrix(
    fline2: str, fline3: str, gline2: str, gline3: str
) -> Tuple[np.ndarray, List[str], List[str]]:
    """Parse Giza++ direct and reverse alignment results and represent them as an alignment matrix

    Args:
        fline2: e.g. "_2 0 1 4_"
        fline3: e.g. "NULL ({ }) twenty ({ 1 }) fourteen ({ 2 3 4 })"
        gline2: e.g. "twenty fourteen"
        gline3: e.g. "NULL ({ }) _2 ({ 1 }) 0 ({ }) 1 ({ }) 4_ ({ 2 })"

    Returns:
        matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment
         the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the
         words were aligned only in one direction, 0 - no alignment.
        srctokens: e.g. ["twenty", "fourteen"]
        dsttokens: e.g. ["_2", "0", "1", "4_"]

    Raises:
        ValueError: if any argument is None or if the number of parsed alignment entries does not match
            the number of tokens.

    For example, the alignment matrix for the above example may look like:
    [[3, 0, 0, 0]
     [0, 2, 2, 3]]
    """
    if fline2 is None or gline2 is None or fline3 is None or gline3 is None:
        raise ValueError("empty params")
    srctokens = gline2.split()
    dsttokens = fline2.split()
    pattern = r"([^ ]+) \(\{ ([^\(\{\}\)]*) \}\)"
    # An empty alignment set is written as "({ })" with a single space; it is padded to two spaces because
    # the pattern expects one space on each side of the (possibly empty) index list.
    src2dst = re.findall(pattern, fline3.replace("({ })", "({  })"))
    dst2src = re.findall(pattern, gline3.replace("({ })", "({  })"))
    # the first parsed entry always belongs to the NULL token, hence "+ 1"
    if len(src2dst) != len(srctokens) + 1:
        raise ValueError(
            f"length mismatch: len(src2dst)={len(src2dst)}; len(srctokens)={len(srctokens)}\n{gline2}\n{fline3}"
        )
    if len(dst2src) != len(dsttokens) + 1:
        raise ValueError(
            f"length mismatch: len(dst2src)={len(dst2src)}; len(dsttokens)={len(dsttokens)}\n{fline2}\n{gline3}"
        )
    matrix = np.zeros((len(srctokens), len(dsttokens)))
    # alignment indices in the input are 1-based; entry 0 (NULL) is skipped
    for src_id, (_, to_str) in enumerate(src2dst[1:]):
        for t in map(int, to_str.split()):
            matrix[src_id][t - 1] = 2

    for dst_id, (_, to_str) in enumerate(dst2src[1:]):
        for t in map(int, to_str.split()):
            matrix[t - 1][dst_id] += 1

    return matrix, srctokens, dsttokens
def check_monotonicity(matrix: np.ndarray) -> bool:
    """Check if alignment is monotonous - i.e. the relative order is preserved (no swaps).
    Only cells equal to 2 or 3 are taken into account, cells equal to 1 are ignored.

    Args:
        matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment
         the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the
         words were aligned only in one direction, 0 - no alignment.
    Returns:
        True if the column indices of the alignment points, taken in order of (row, column), never decrease.
    """
    # np.argwhere lists the points in row-major order, i.e. already sorted by row and then by column
    columns = np.argwhere((matrix == 3) | (matrix == 2))[:, 1]
    # bool() turns numpy.bool_ into the plain bool promised by the annotation
    return bool(np.all(columns[:-1] <= columns[1:]))
def get_targets(matrix: np.ndarray, dsttokens: List[str], delimiter: str) -> List[str]:
    """Join some of the destination tokens, so that their number becomes the same as the number of input words.
    Unaligned tokens tend to join to the left aligned token.

    A destination token is attached to a source word either at an anchor (matrix cell equal to 3) or when both
    the source row and the next uncovered destination column are entirely zero. A destination column counts as
    unaligned only if it is entirely zero: tokens whose column holds only 1 or 2 are neither anchors nor
    unaligned and do not get into the output.

    Args:
        matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment
         the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the
         words were aligned only in one direction, 0 - no alignment.
        dsttokens: e.g. ["_2", "0", "1", "4_"]
        delimiter: string put between destination tokens joined into one target
    Returns:
        targets: list of string tokens, with one-to-one correspondence to matrix.shape[0]

    Example:
        If we get
            matrix=[[3, 0, 0, 0]
                    [0, 0, 0, 3]]
            dsttokens=["_2", "0", "1", "4_"]
            delimiter=""
        it gives
            targets = ["_201", "4_"]
        Actually, this is a mistake instead of ["_20", "14_"]. That will be further corrected by regular expressions.
    """
    # the matrix is not modified below, so emptiness of rows and columns is computed once
    column_is_empty = ~np.any(matrix != 0, axis=0)
    row_is_empty = ~np.any(matrix != 0, axis=1)

    targets = []
    last_covered_dst_id = -1
    for i in range(len(matrix)):
        dstlist = []
        for j in range(last_covered_dst_id + 1, len(dsttokens)):
            is_anchor = matrix[i][j] == 3
            # neither the whole row nor the whole column has alignment points: match them
            is_free_pair = j == last_covered_dst_id + 1 and row_is_empty[i] and column_is_empty[j]
            if not (is_anchor or is_free_pair):
                continue
            if not targets:  # if this is first matched point, attach left unaligned columns to it, if any
                for k in range(j):
                    if not column_is_empty[k]:
                        break
                    dstlist.append(dsttokens[k])
            dstlist.append(dsttokens[j])
            last_covered_dst_id = j
            # attach the unaligned columns that directly follow
            for k in range(j + 1, len(dsttokens)):
                if not column_is_empty[k]:
                    break
                dstlist.append(dsttokens[k])
                last_covered_dst_id = k

        targets.append(delimiter.join(dstlist))  # empty string if nothing was attached
    return targets
def get_targets_from_back(matrix: np.ndarray, dsttokens: List[str], delimiter: str) -> List[str]:
    """Join some of the destination tokens, so that their number becomes the same as the number of input words.
    Unaligned tokens tend to join to the right aligned token.

    Rows and columns are scanned from the end. A destination token is attached to a source word either at an
    anchor (matrix cell equal to 3) or when both the source row and the next uncovered destination column are
    entirely zero. A destination column counts as unaligned only if it is entirely zero: tokens whose column
    holds only 1 or 2 are neither anchors nor unaligned and do not get into the output.

    Args:
        matrix: a numpy array of shape (src_len, dst_len) filled with [0, 1, 2, 3], where 3 means a reliable alignment
         the corresponding words were aligned to one another in direct and reverse alignment runs, 1 and 2 mean that the
         words were aligned only in one direction, 0 - no alignment.
        dsttokens: e.g. ["_2", "0", "1", "4_"]
        delimiter: string put between destination tokens joined into one target
    Returns:
        targets: list of string tokens, with one-to-one correspondence to matrix.shape[0]

    Example:
        If we get
            matrix=[[3, 0, 0, 0]
                    [0, 0, 0, 3]]
            dsttokens=["_2", "0", "1", "4_"]
            delimiter=""
        it gives
            targets = ["_2", "014_"]
        Actually, this is a mistake instead of ["_20", "14_"]. That will be further corrected by regular expressions.
    """
    # the matrix is not modified below, so emptiness of rows and columns is computed once
    column_is_empty = ~np.any(matrix != 0, axis=0)
    row_is_empty = ~np.any(matrix != 0, axis=1)

    targets = []
    last_covered_dst_id = len(dsttokens)
    for i in range(len(matrix) - 1, -1, -1):
        dstlist = []  # collected right-to-left, reversed before joining
        for j in range(last_covered_dst_id - 1, -1, -1):
            is_anchor = matrix[i][j] == 3
            is_free_pair = j == last_covered_dst_id - 1 and row_is_empty[i] and column_is_empty[j]
            if not (is_anchor or is_free_pair):
                continue
            if not targets:  # if this is first matched point, attach right unaligned columns to it, if any
                for k in range(len(dsttokens) - 1, j, -1):
                    if not column_is_empty[k]:
                        break
                    dstlist.append(dsttokens[k])
            dstlist.append(dsttokens[j])
            last_covered_dst_id = j
            # attach the unaligned columns that directly precede
            for k in range(j - 1, -1, -1):
                if not column_is_empty[k]:
                    break
                dstlist.append(dsttokens[k])
                last_covered_dst_id = k
        targets.append(delimiter.join(reversed(dstlist)))  # empty string if nothing was attached
    targets.reverse()
    return targets
CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from nemo.collections.nlp.models.spellchecking_asr_customization.spellchecking_model import ( + SpellcheckingAsrCustomizationModel, +) diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py new file mode 100644 index 000000000000..fc889de2dc63 --- /dev/null +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -0,0 +1,526 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from time import perf_counter +from typing import Dict, Optional + +import torch +from omegaconf import DictConfig +from pytorch_lightning import Trainer + +from nemo.collections.common.losses import CrossEntropyLoss +from nemo.collections.nlp.data.spellchecking_asr_customization import ( + SpellcheckingAsrCustomizationDataset, + SpellcheckingAsrCustomizationTestDataset, + TarredSpellcheckingAsrCustomizationDataset, + bert_example, +) +from nemo.collections.nlp.data.text_normalization_as_tagging.utils import read_label_map +from nemo.collections.nlp.metrics.classification_report import ClassificationReport +from nemo.collections.nlp.models.nlp_model import NLPModel +from nemo.collections.nlp.modules.common.token_classifier import TokenClassifier +from nemo.collections.nlp.parts.utils_funcs import tensor2list +from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.neural_types import LogitsType, NeuralType +from nemo.utils import logging +from nemo.utils.decorators import experimental + +__all__ = ["SpellcheckingAsrCustomizationModel"] + + +@experimental +class SpellcheckingAsrCustomizationModel(NLPModel): + """ + BERT-based model for Spellchecking ASR Customization. + It takes as input ASR hypothesis and candidate customization entries. + It labels the hypothesis with correct entry index or 0. + Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... 
+ """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "logits": NeuralType(('B', 'T', 'D'), LogitsType()), + } + + @property + def input_module(self): + return self + + @property + def output_module(self): + return self + + def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + super().__init__(cfg=cfg, trainer=trainer) + + # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids + label_map_file = self.register_artifact("label_map", cfg.label_map, verify_src_exists=True) + + # Semiotic classes for this model consist only of classes CUSTOM(means fragment containing custom candidate) and PLAIN (any other single-character fragment) + # They are used only during validation step, to calculate accuracy for CUSTOM and PLAIN classes separately + semiotic_classes_file = self.register_artifact( + "semiotic_classes", cfg.semiotic_classes, verify_src_exists=True + ) + self.label_map = read_label_map(label_map_file) + self.semiotic_classes = read_label_map(semiotic_classes_file) + + self.num_labels = len(self.label_map) + self.num_semiotic_labels = len(self.semiotic_classes) + self.id_2_tag = {tag_id: tag for tag, tag_id in self.label_map.items()} + self.id_2_semiotic = {semiotic_id: semiotic for semiotic, semiotic_id in self.semiotic_classes.items()} + self.max_sequence_len = cfg.get('max_sequence_len', self.tokenizer.tokenizer.model_max_length) + + # Setup to track metrics + # We will have (len(self.semiotic_classes) + 1) labels. 
+ # Last one stands for WRONG (span in which the predicted tags don't match the labels) + # This is needed to feed the sequence of classes to classification_report during validation + label_ids = self.semiotic_classes.copy() + label_ids["WRONG"] = len(self.semiotic_classes) + self.tag_classification_report = ClassificationReport( + len(self.semiotic_classes) + 1, label_ids=label_ids, mode='micro', dist_sync_on_step=True + ) + + self.hidden_size = cfg.hidden_size + + # hidden size is doubled because in forward we concatenate embeddings for characters and embeddings for subwords + self.logits = TokenClassifier( + self.hidden_size * 2, num_classes=self.num_labels, num_layers=1, log_softmax=False, dropout=0.1 + ) + + self.loss_fn = CrossEntropyLoss(logits_ndim=3) + + self.builder = bert_example.BertExampleBuilder( + self.label_map, self.semiotic_classes, self.tokenizer.tokenizer, self.max_sequence_len + ) + + @typecheck() + def forward( + self, + input_ids, + input_mask, + segment_ids, + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + character_pos_to_subword_pos, + ): + """ + Same BERT-based model is used to calculate embeddings for sequence of single characters and for sequence of subwords. + Then we concatenate subword embeddings to each character corresponding to this subword. + We return logits for each character x 11 labels: 0 - character doesn't belong to any candidate, 1..10 - character belongs to candidate with this id. 
+ + # Arguments + input_ids: token_ids for single characters; .shape = [batch_size, char_seq_len]; .dtype = int64 + input_mask: mask for input_ids(1 - real, 0 - padding); .shape = [batch_size, char_seq_len]; .dtype = int64 + segment_ids: segment types for input_ids (0 - ASR-hypothesis, 1..10 - candidate); .shape = [batch_size, char_seq_len]; .dtype = int64 + input_ids_for_subwords: token_ids for subwords; .shape = [batch_size, subword_seq_len]; .dtype = int64 + input_mask_for_subwords: mask for input_ids_for_subwords(1 - real, 0 - padding); .shape = [batch_size, subword_seq_len]; .dtype = int64 + segment_ids_for_subwords: segment types for input_ids_for_subwords (0 - ASR-hypothesis, 1..10 - candidate); .shape = [batch_size, subword_seq_len]; .dtype = int64 + character_pos_to_subword_pos: tensor mapping character position in the input sequence to subword position; .shape = [batch_size, char_seq_len]; .dtype = int64 + """ + + # src_hiddens.shape = [batch_size, char_seq_len, bert_hidden_size]; .dtype=float32 + src_hiddens = self.bert_model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) + # src_hiddens_for_subwords.shape = [batch_size, subword_seq_len, bert_hidden_size]; .dtype=float32 + src_hiddens_for_subwords = self.bert_model( + input_ids=input_ids_for_subwords, + token_type_ids=segment_ids_for_subwords, + attention_mask=input_mask_for_subwords, + ) + + # Next three commands concatenate subword embeddings to each character embedding of the corresponding subword + # index.shape = [batch_size, char_seq_len, bert_hidden_size]; .dtype=int64 + index = character_pos_to_subword_pos.unsqueeze(-1).expand((-1, -1, src_hiddens_for_subwords.shape[2])) + # src_hiddens_2.shape = [batch_size, char_seq_len, bert_hidden_size]; .dtype=float32 + src_hiddens_2 = torch.gather(src_hiddens_for_subwords, 1, index) + # src_hiddens.shape = [batch_size, char_seq_len, bert_hidden_size * 2]; .dtype=float32 + src_hiddens = torch.cat((src_hiddens, src_hiddens_2), 2) 
+
+        # logits.shape = [batch_size, char_seq_len, num_labels]; num_labels=11: ids from 0 to 10; .dtype=float32
+        logits = self.logits(hidden_states=src_hiddens)
+        return logits
+
+    # Training
+    def training_step(self, batch, batch_idx):
+        """
+        Lightning calls this inside the training loop with `batch` coming from the training dataloader.
+        Runs `forward`, computes the loss with `labels_mask` as loss mask, logs `train_loss`/`lr`, returns both.
+        """
+
+        (
+            input_ids,
+            input_mask,
+            segment_ids,
+            input_ids_for_subwords,
+            input_mask_for_subwords,
+            segment_ids_for_subwords,
+            character_pos_to_subword_pos,
+            labels_mask,
+            labels,
+            _,  # tenth batch item is ignored here (validation_step unpacks it as `spans`)
+        ) = batch
+        logits = self.forward(
+            input_ids=input_ids,
+            input_mask=input_mask,
+            segment_ids=segment_ids,
+            input_ids_for_subwords=input_ids_for_subwords,
+            input_mask_for_subwords=input_mask_for_subwords,
+            segment_ids_for_subwords=segment_ids_for_subwords,
+            character_pos_to_subword_pos=character_pos_to_subword_pos,
+        )
+        loss = self.loss_fn(logits=logits, labels=labels, loss_mask=labels_mask)
+        lr = self._optimizer.param_groups[0]['lr']  # learning rate of the first optimizer param group
+        self.log('train_loss', loss)
+        self.log('lr', lr, prog_bar=True)
+        return {'loss': loss, 'lr': lr}
+
+    # Validation and Testing
+    def validation_step(self, batch, batch_idx):
+        """
+        Lightning calls this inside the validation loop with `batch` coming from the validation dataloader.
+        Updates `self.tag_classification_report` once per example and returns `{'val_loss': ...}`.
+        """
+        (
+            input_ids,
+            input_mask,
+            segment_ids,
+            input_ids_for_subwords,
+            input_mask_for_subwords,
+            segment_ids_for_subwords,
+            character_pos_to_subword_pos,
+            labels_mask,
+            labels,
+            spans,
+        ) = batch
+        logits = self.forward(
+            input_ids=input_ids,
+            input_mask=input_mask,
+            segment_ids=segment_ids,
+            input_ids_for_subwords=input_ids_for_subwords,
+            input_mask_for_subwords=input_mask_for_subwords,
+            segment_ids_for_subwords=segment_ids_for_subwords,
+            character_pos_to_subword_pos=character_pos_to_subword_pos,
+        )
+        tag_preds = torch.argmax(logits, dim=2)  # index of the highest logit along dim 2
+
+        # Update tag classification_report
+        for input_mask_seq, segment_seq, prediction_seq, label_seq, span_seq in zip(
+            input_mask.tolist(), segment_ids.tolist(), tag_preds.tolist(), labels.tolist(), spans.tolist()
+        ):
+            # Here we want to track whether the predicted output matches ground truth labels for each whole span.
+            # We construct the special input for classification report, for example:
+            #     span_labels = [PLAIN, PLAIN, PLAIN, PLAIN, CUSTOM, CUSTOM]
+            #     span_predictions = [PLAIN, WRONG, PLAIN, PLAIN, WRONG, CUSTOM]
+            # Note that the number of PLAIN and CUSTOM occurrences in the report is not comparable,
+            # because PLAIN is for characters, and CUSTOM is for phrases.
+            span_labels = []
+            span_predictions = []
+            plain_cid = self.semiotic_classes["PLAIN"]
+            wrong_cid = self.tag_classification_report.num_classes - 1  # the last class id stands for WRONG
+
+            # First we loop through all predictions for input characters with label=0, they are regarded as separate spans with PLAIN class.
+            # It either stays as PLAIN if the model prediction is 0, or turns to WRONG.
+            for i in range(len(segment_seq)):
+                if input_mask_seq[i] == 0:
+                    continue
+                if segment_seq[i] > 0:  # token does not belong to ASR-hypothesis => it's over
+                    break
+                if label_seq[i] == 0:
+                    span_labels.append(plain_cid)
+                    if prediction_seq[i] == 0:
+                        span_predictions.append(plain_cid)
+                    else:
+                        span_predictions.append(wrong_cid)
+                # if label_seq[i] != 0 then it belongs to CUSTOM span and will be handled later
+
+            # Second we loop through spans tensor which contains only spans for CUSTOM class.
+            # It stays as CUSTOM if all predictions for the whole span are equal to the labels, otherwise it turns to WRONG.
+            for cid, start, end in span_seq:
+                if cid == -1:  # stop at the first cid == -1 (presumably padding of the spans tensor - TODO confirm)
+                    break
+                span_labels.append(cid)
+                if prediction_seq[start:end] == label_seq[start:end]:  # list slices are compared element-wise
+                    span_predictions.append(cid)
+                else:
+                    span_predictions.append(wrong_cid)
+
+            if len(span_labels) != len(span_predictions):
+                raise ValueError(
+                    "Length mismatch: len(span_labels)="
+                    + str(len(span_labels))
+                    + "; len(span_predictions)="
+                    + str(len(span_predictions))
+                )
+            self.tag_classification_report(
+                torch.tensor(span_predictions).to(self.device), torch.tensor(span_labels).to(self.device)
+            )
+
+        val_loss = self.loss_fn(logits=logits, labels=labels, loss_mask=labels_mask)
+        return {'val_loss': val_loss}
+
+    def validation_epoch_end(self, outputs):
+        """
+        Called at the end of validation to aggregate outputs.
+        :param outputs: list of individual outputs of each validation step.
+        """
+        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
+
+        # Calculate metrics and classification report
+        # Note that in our task recall = accuracy, and the recall column is the per class accuracy
+        _, tag_accuracy, _, tag_report = self.tag_classification_report.compute()
+
+        logging.info("Total tag accuracy: " + str(tag_accuracy))
+        logging.info(tag_report)
+
+        self.log('val_loss', avg_loss, prog_bar=True)
+        self.log('tag accuracy', tag_accuracy)
+
+        self.tag_classification_report.reset()
+
+    def test_step(self, batch, batch_idx):
+        """
+        Lightning calls this inside the test loop with the data from the test dataloader
+        passed in as `batch`.
+        """
+        return self.validation_step(batch, batch_idx)
+
+    def test_epoch_end(self, outputs):
+        """
+        Called at the end of test to aggregate outputs.
+        :param outputs: list of individual outputs of each test step.
+        """
+        return self.validation_epoch_end(outputs)
+
+    # Functions for inference
+
+    @torch.no_grad()
+    def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None:
+        """ Main function for Inference
+
+        Args:
+            dataloader_cfg: config for dataloader
+            input_name: Input file with tab-separated text records. Each record consists of 2 items:
+                - ASR hypothesis
+                - candidate phrases separated by semicolon
+            output_name: Output file with tab-separated text records. Each record consists of 4 items:
+                - ASR hypothesis
+                - candidate phrases separated by semicolon
+                - list of possible replacements with probabilities (start, end, candidate_id, prob), separated by semicolon
+                - list of labels, predicted for each letter (for debug purposes)
+        Raises: ValueError (chained to the underlying exception) if anything fails while processing `input_name`
+        Returns: None
+        """
+        mode = self.training
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logging_level = logging.get_verbosity()  # captured before `try` so that `finally` can always restore it
+
+        try:
+            # Switch model to evaluation mode
+            self.eval()
+            self.to(device)
+            logging.set_verbosity(logging.WARNING)
+            infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, input_name)
+
+            all_tag_preds = (
+                []
+            )  # list(size=number of sentences) of lists(size=number of letters) of tag predictions (best candidate_id for each letter)
+            all_possible_replacements = (
+                []
+            )  # list(size=number of sentences) of lists(size=number of potential replacements) of tuples(start, pos, candidate_id, prob)
+            for batch in iter(infer_datalayer):
+                (
+                    input_ids,
+                    input_mask,
+                    segment_ids,
+                    input_ids_for_subwords,
+                    input_mask_for_subwords,
+                    segment_ids_for_subwords,
+                    character_pos_to_subword_pos,
+                    fragment_indices,
+                ) = batch
+
+                # tag_logits.shape = [batch_size, char_seq_len, num_labels]; num_labels=11: ids from 0 to 10; .dtype=float32
+                tag_logits = self.forward(
+                    input_ids=input_ids.to(self.device),
+                    input_mask=input_mask.to(self.device),
+                    segment_ids=segment_ids.to(self.device),
+                    input_ids_for_subwords=input_ids_for_subwords.to(self.device),
+                    input_mask_for_subwords=input_mask_for_subwords.to(self.device),
+                    segment_ids_for_subwords=segment_ids_for_subwords.to(self.device),
+                    character_pos_to_subword_pos=character_pos_to_subword_pos.to(self.device),
+                )
+
+                # fragment_indices.shape=[batch_size, num_fragments, 3], where last dimension is [start, end, label], where label is candidate id from 1 to 10
+                # Next we want to convert predictions for separate letters to probabilities for each whole fragment from fragment_indices.
+                # To achieve this we first sum the letter logits in each fragment and divide by its length.
+                # (We use .cumsum and then difference between end and start to get sum per fragment).
+                # Then we convert logits to probs with softmax and for each fragment extract only the prob for given label.
+                # Finally we get a list of tuples (start, end, label, prob)
+                indices_len = fragment_indices.shape[1]
+                # this padding adds a row of zeros (size=num_labels) as first element of sequence in second dimension. This is needed for cumsum operations.
+                padded_logits = torch.nn.functional.pad(tag_logits, pad=(0, 0, 1, 0))
+                (
+                    batch_size,
+                    seq_len,
+                    num_labels,
+                ) = padded_logits.shape  # seq_len is +1 compared to that of tag_logits, because of padding
+                # cumsum.shape=[batch_size, seq_len, num_labels]
+                cumsum = padded_logits.cumsum(dim=1)
+                # the size -1 is inferred from other dimensions. We get rid of batch dimension.
+                cumsum_view = cumsum.view(-1, num_labels)
+                word_index = (
+                    torch.ones((batch_size, indices_len), dtype=torch.long)
+                    * torch.arange(batch_size).reshape((-1, 1))
+                    * seq_len
+                ).view(-1)
+                lower_index = (fragment_indices[..., 0]).view(-1) + word_index
+                higher_index = (fragment_indices[..., 1]).view(-1) + word_index
+                d_index = (higher_index - lower_index).reshape((-1, 1)).to(self.device)  # word lengths
+                dlog = cumsum_view[higher_index, :] - cumsum_view[lower_index, :]  # sum of logits
+                # word_logits.shape=[batch_size, indices_len, num_labels]
+                word_logits = (dlog / d_index.float()).view(batch_size, indices_len, num_labels)
+                # convert logits to probs, same shape
+                word_probs = torch.nn.functional.softmax(word_logits, dim=-1).to(self.device)
+                # candidate_index.shape=[batch_size, indices_len]
+                candidate_index = fragment_indices[:, :, 2].to(self.device)
+                # candidate_probs.shape=[batch_size, indices_len]
+                candidate_probs = torch.take_along_dim(word_probs, candidate_index.unsqueeze(2), dim=-1).squeeze(2)
+                for i in range(batch_size):
+                    possible_replacements = []
+                    for j in range(indices_len):
+                        start, end, candidate_id = (
+                            int(fragment_indices[i][j][0]),
+                            int(fragment_indices[i][j][1]),
+                            int(fragment_indices[i][j][2]),
+                        )
+                        if candidate_id == 0:  # this is padding
+                            continue
+                        prob = round(float(candidate_probs[i][j]), 5)
+                        if prob < 0.01:
+                            continue
+                        # -1 because in the output file we will not have a [CLS] token
+                        possible_replacements.append(
+                            str(start - 1) + " " + str(end - 1) + " " + str(candidate_id) + " " + str(prob)
+                        )
+                    all_possible_replacements.append(possible_replacements)
+
+                # torch.argmax(tag_logits, dim=-1) gives a tensor of best predicted labels with shape [batch_size, char_seq_len], .dtype = int64
+                # character_preds is list of lists of predicted labels
+                character_preds = tensor2list(torch.argmax(tag_logits, dim=-1))
+                all_tag_preds.extend(character_preds)
+
+            if len(all_possible_replacements) != len(all_tag_preds) or len(all_possible_replacements) != len(
+                infer_datalayer.dataset.examples
+            ):
+                raise IndexError(
+                    "number of sentences mismatch: len(all_possible_replacements)="
+                    + str(len(all_possible_replacements))
+                    + "; len(all_tag_preds)="
+                    + str(len(all_tag_preds))
+                    + "; len(infer_datalayer.dataset.examples)="
+                    + str(len(infer_datalayer.dataset.examples))
+                )
+            # save results to file
+            with open(output_name, "w", encoding="utf-8") as out:
+                for i in range(len(infer_datalayer.dataset.examples)):
+                    hyp, ref = infer_datalayer.dataset.hyps_refs[i]
+                    num_letters = hyp.count(" ") + 1
+                    tag_pred_str = " ".join(list(map(str, all_tag_preds[i][1 : (num_letters + 1)])))
+                    possible_replacements_str = ";".join(all_possible_replacements[i])
+                    out.write(hyp + "\t" + ref + "\t" + possible_replacements_str + "\t" + tag_pred_str + "\n")
+
+        except Exception as e:
+            raise ValueError("Error processing file " + input_name) from e  # keep the real cause in the traceback
+
+        finally:
+            # set mode back to its original value
+            self.train(mode=mode)
+            logging.set_verbosity(logging_level)
+
+    # Functions for processing data
+    def setup_training_data(self,
train_data_config: Optional[DictConfig]):
+        if not train_data_config or not train_data_config.data_path:  # no config or empty data_path => no train loader
+            logging.info(
+                "Dataloader config or file_path for the train is missing, so no data loader for train is created!"
+            )
+            self._train_dl = None
+            return
+        self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, data_split="train")
+
+    def setup_validation_data(self, val_data_config: Optional[DictConfig]):
+        if not val_data_config or not val_data_config.data_path:  # no config or empty data_path => no val loader
+            logging.info(
+                "Dataloader config or file_path for the validation is missing, so no data loader for validation is created!"
+            )
+            self._validation_dl = None
+            return
+        self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config, data_split="val")
+
+    def setup_test_data(self, test_data_config: Optional[DictConfig]):
+        if not test_data_config or not test_data_config.data_path:  # same emptiness check as train/validation
+            logging.info(
+                "Dataloader config or file_path for the test is missing, so no data loader for test is created!"
+            )
+            self._test_dl = None
+            return
+        self._test_dl = self._setup_dataloader_from_config(cfg=test_data_config, data_split="test")
+
+    def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):
+        start_time = perf_counter()  # only used to log how long dataset/dataloader creation takes
+        logging.info(f'Creating {data_split} dataset')
+        if cfg.get("use_tarred_dataset", False):  # tarred dataset is opt-in via the config
+            dataset = TarredSpellcheckingAsrCustomizationDataset(
+                cfg.data_path,
+                shuffle_n=cfg.get("tar_shuffle_n", 100),
+                global_rank=self.global_rank,
+                world_size=self.world_size,
+                pad_token_id=self.builder._pad_id,
+            )
+        else:
+            input_file = cfg.data_path
+            dataset = SpellcheckingAsrCustomizationDataset(input_file=input_file, example_builder=self.builder)
+        dl = torch.utils.data.DataLoader(
+            dataset=dataset, batch_size=cfg.batch_size, shuffle=cfg.shuffle, collate_fn=dataset.collate_fn
+        )
+        running_time = perf_counter() - start_time
+        logging.info(f'Took {running_time} seconds')
+        return dl
+
+    def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.utils.data.DataLoader':
+        """
+        Setup function for an inference data loader (never shuffled, last incomplete batch is kept).
+        Args:
+            cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory
+            input_name: path to input file.
+        Returns:
+            A pytorch DataLoader.
+        """
+        dataset = SpellcheckingAsrCustomizationTestDataset(input_name, example_builder=self.builder)
+        return torch.utils.data.DataLoader(
+            dataset=dataset,
+            batch_size=cfg["batch_size"],
+            shuffle=False,
+            num_workers=cfg.get("num_workers", 0),
+            pin_memory=cfg.get("pin_memory", False),
+            drop_last=False,
+            collate_fn=dataset.collate_fn,
+        )
+
+    @classmethod
+    def list_available_models(cls) -> Optional[PretrainedModelInfo]:
+        return None  # this method reports no pretrained models
diff --git a/scripts/dataset_processing/spoken_wikipedia/run.sh b/scripts/dataset_processing/spoken_wikipedia/run.sh
index 2894eb1dc55e..5ae447c9a1a4 100644
--- a/scripts/dataset_processing/spoken_wikipedia/run.sh
+++ b/scripts/dataset_processing/spoken_wikipedia/run.sh
@@ -102,7 +102,7 @@ ${NEMO_PATH}/tools/ctc_segmentation/run_segmentation.sh \
 --MODEL_NAME_OR_PATH=${MODEL_FOR_SEGMENTATION} \
 --DATA_DIR=${INPUT_DIR}_prepared \
 --OUTPUT_DIR=${OUTPUT_DIR} \
---MIN_SCORE=${MIN_SCORE}
+--MIN_SCORE=${THRESHOLD}
 
 # Thresholds for filtering
 CER_THRESHOLD=20
diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py
new file mode 100644
index 000000000000..8e4d6e9a7b8f
--- /dev/null
+++ b/tests/collections/nlp/test_spellchecking_asr_customization.py
@@ -0,0 +1,1102 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from transformers import AutoTokenizer
+
+from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder
+from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (
+    apply_replacements_to_text,
+    substitute_replacements_in_text,
+)
+
+
+@pytest.mark.unit
+def test_substitute_replacements_in_text():  # checks substitution with and without hyphen-to-space conversion
+    text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement"
+    replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)]
+    gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement"
+    corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False)
+    assert corrected_text == gold_text
+
+    gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement"
+    corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True)
+    assert corrected_text == gold_text_no_hyphen
+
+
+@pytest.mark.unit
+def test_apply_replacements_to_text():  # checks duplicate replacements and the min_prob threshold
+
+    # min_prob = 0.5
+    # dp_data = None,
+    # min_dp_score_per_symbol: float = -99.9
+
+    # test more than one fragment to replace, test multiple same replacements
+    text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement"
+    replacements = [
+        (66, 75, 'proterra', 0.99986),
+        (66, 75, 'proterra', 0.9956),
+        (101, 109, 'navistar', 0.93),
+        (101, 109, 'navistar', 0.91),
+        (101, 109, 'navistar', 0.92),
+    ]
+    gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement"
+    corrected_text = apply_replacements_to_text(
+        text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None
+    )
+    assert corrected_text == gold_text
+
+    # test that min_prob works: all 'navistar' candidates (0.93, 0.91, 0.92) are below 0.95, so 'navastar' stays
+    gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement"
+    corrected_text = apply_replacements_to_text(
+        text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None
+    )
+    assert corrected_text == gold_text
+
+
+@pytest.fixture()
+def bert_example_builder():  # builds a BertExampleBuilder; loads the tokenizer by name via AutoTokenizer.from_pretrained
+    tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D")
+    label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10}
+    semiotic_classes = {"PLAIN": 0, "CUSTOM": 1}
+    max_seq_len = 256
+    builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, max_seq_len)
+    return builder
+
+
+@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason")
+@pytest.mark.with_downloads
+@pytest.mark.unit
+def test_creation(bert_example_builder):  # smoke test: the fixture produced a builder with a tokenizer
+    assert bert_example_builder._tokenizer is not None
+
+
+@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason")
+@pytest.mark.with_downloads
+@pytest.mark.unit
+def test_builder_get_spans(bert_example_builder):
+    span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"]
+    gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)]  # (1, start + 1, end + 1) per input span
+    spans = bert_example_builder._get_spans(span_info_parts)
+    spans.sort()  # the assertion is order-independent with respect to the builder output
+    assert spans == gold_sorted_spans
+
+
+@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason")
+@pytest.mark.with_downloads
+@pytest.mark.unit
+def test_builder_get_fragment_indices(bert_example_builder):
+    hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w"
+    targets = [1]
+    # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w
+    # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+    span_info_parts = ["CUSTOM 8
17"] + gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] + fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) + fragment_indices.sort() + assert fragment_indices == gold_sorted_fragment_indices + + # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w + # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + span_info_parts = ["CUSTOM 10 16"] + gold_sorted_fragment_indices = [(11, 18, 1)] + fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) + fragment_indices.sort() + assert fragment_indices == gold_sorted_fragment_indices + + +@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") +@pytest.mark.with_downloads +@pytest.mark.unit +def test_builder_get_input_features(bert_example_builder): + hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" + targets = [1, 3] + span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] + + gold_tags = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + ] + gold_input_ids = [ + 101, + 1037, + 1055, + 1056, + 1054, + 1051, + 1050, + 1051, + 1049, + 1041, + 1054, + 1055, + 1035, + 1040, + 1045, + 1040, + 1045, + 1041, + 1035, + 1055, + 1051, + 1049, + 1051, + 1050, + 1035, + 1037, + 1050, + 1040, + 1035, + 1056, + 1054, + 1045, + 1055, + 1056, + 1045, + 1037, + 1050, + 1035, + 1043, + 1048, + 1048, + 1051, + 102, + 1040, + 1045, + 1040, + 1045, + 1041, + 1054, + 1035, + 1055, + 1037, + 1057, + 1049, + 1051, + 1050, + 102, + 1037, + 1055, + 1056, + 1054, + 1051, + 1050, + 1051, + 1049, + 1045, + 1041, + 102, + 1056, + 
1054, + 1045, + 1055, + 1056, + 1037, + 1050, + 1035, + 1043, + 1057, + 1045, + 1048, + 1048, + 1051, + 1056, + 102, + 1056, + 1054, + 1045, + 1055, + 1056, + 1041, + 1055, + 1055, + 1041, + 102, + 1049, + 1051, + 1050, + 1037, + 1040, + 1041, + 102, + 1039, + 1044, + 1054, + 1045, + 1055, + 1056, + 1045, + 1037, + 1050, + 102, + 1037, + 1055, + 1056, + 1054, + 1051, + 1050, + 1051, + 1049, + 1041, + 1054, + 102, + 1055, + 1051, + 1048, + 1051, + 1049, + 1051, + 1050, + 102, + 1040, + 1045, + 1040, + 1045, + 1040, + 1045, + 1040, + 1045, + 1040, + 1045, + 102, + 1049, + 1041, + 1054, + 1039, + 1061, + 102, + ] + gold_input_maskgold_segment_ids = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 10, + 10, + 10, + 10, + 10, + 10, + ] + gold_labels_mask = [ + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, 
+ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] + gold_input_ids_for_subwords = [ + 101, + 26357, + 2106, + 2666, + 2061, + 8202, + 1998, + 13012, + 16643, + 2319, + 1043, + 7174, + 102, + 2106, + 3771, + 7842, + 2819, + 2239, + 102, + 28625, + 3630, + 9856, + 102, + 9822, + 26458, + 7174, + 2102, + 102, + 13012, + 13473, + 11393, + 102, + 13813, + 3207, + 102, + 3017, + 102, + 15211, + 102, + 9168, + 102, + 2106, + 28173, + 4305, + 4305, + 102, + 8673, + 102, + ] + gold_input_mask_for_subwords = [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + ] + gold_segment_ids_for_subwords = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 4, + 4, + 4, + 4, + 5, + 5, + 5, + 6, + 6, + 7, + 7, + 8, + 8, + 9, + 9, + 9, + 9, + 9, + 10, + 10, + ] + gold_character_pos_to_subword_pos = [ + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 3, + 3, + 3, + 4, + 4, + 5, + 5, + 5, + 5, + 6, + 6, + 6, + 6, + 7, + 7, + 7, + 8, + 8, + 8, + 9, + 9, + 9, + 10, + 11, + 11, + 11, + 12, + 13, + 13, + 13, + 14, + 14, + 14, + 14, + 15, + 15, + 16, + 16, + 17, + 17, + 18, + 19, + 19, + 19, + 19, + 19, + 20, + 20, + 21, + 21, + 21, + 22, + 23, + 23, + 23, + 23, + 23, + 23, + 23, + 23, + 24, + 24, + 24, + 25, + 25, + 25, + 26, + 27, + 28, + 28, + 28, + 29, + 29, + 29, + 30, + 30, + 30, + 31, + 32, + 32, + 32, + 32, + 33, + 33, + 34, + 35, + 35, + 35, + 35, + 35, + 35, + 35, + 35, + 35, + 36, + 37, + 37, + 37, + 37, + 37, + 37, + 37, + 37, + 37, + 37, + 38, + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 40, + 41, + 41, + 41, + 42, + 42, + 42, + 43, + 43, + 44, + 44, + 45, + 46, 
+ 46, + 46, + 46, + 46, + 47, + ] + + tags = [0 for _ in hyp.split()] + for p, t in zip(span_info_parts, targets): + c, start, end = p.split(" ") + start = int(start) + end = int(end) + tags[start:end] = [t for i in range(end - start)] + + # get input features for characters + (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( + hyp=hyp, ref=ref, tags=tags + ) + + # get input features for words + hyp_with_words = hyp.replace(" ", "").replace("_", " ") + ref_with_words = ref.replace(" ", "").replace("_", " ") + ( + input_ids_for_subwords, + input_mask_for_subwords, + segment_ids_for_subwords, + _, + _, + _, + _, + ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) + + character_pos_to_subword_pos = bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) + + assert tags == gold_tags + assert input_ids == gold_input_ids + assert input_mask == gold_input_mask + assert segment_ids == gold_segment_ids + assert labels_mask == gold_labels_mask + assert input_ids_for_subwords == gold_input_ids_for_subwords + assert input_mask_for_subwords == gold_input_mask_for_subwords + assert segment_ids_for_subwords == gold_segment_ids_for_subwords + assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tools/ctc_segmentation/scripts/prepare_data.py b/tools/ctc_segmentation/scripts/prepare_data.py index 429b642d5ba0..c6ea024273fb 100644 --- a/tools/ctc_segmentation/scripts/prepare_data.py +++ b/tools/ctc_segmentation/scripts/prepare_data.py @@ -151,7 +151,7 @@ def split_text( ) # end of quoted speech - to be able to split sentences by full stop - transcript = re.sub(r"([\.\?\!])([\"\'])", r"\g<2>\g<1> ", transcript) + transcript = re.sub(r"([\.\?\!])([\"\'”])", r"\g<2>\g<1> ", transcript) # remove extra space transcript = re.sub(r" +", " ", transcript) diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb 
b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb new file mode 100644 index 000000000000..189ac958d377 --- /dev/null +++ b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb @@ -0,0 +1,1403 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "PiRuohn_FQco" + }, + "source": [ + "# Overview\n", + "This tutorial demonstrates how to run inference with SpellMapper - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", + "\n", + "Estimated time: 10-15 min.\n", + "\n", + "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", + "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", + "\n", + "This model is an alternative to word boosting/shallow fusion approaches:\n", + " - does not require retraining ASR model;\n", + " - does not require beam-search/language model(LM);\n", + " - can be applied on top of any English ASR model output;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qm5wmxVEGXgH" + }, + "source": [ + "## What is custom vocabulary?\n", + "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. 
The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", + "\n", + "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", + "\n", + "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D5_XwuXDOKho" + }, + "source": [ + "## Tutorial Plan\n", + "\n", + "1. Create a sample custom vocabulary using some medical terminology.\n", + "2. Study what customization does - a detailed analysis of a small example.\n", + "3. Run a bigger example:\n", + " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", + " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", + "\n", + "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", + "* `puramesin` => `puromycin`\n", + "* `parromsin` => `puromycin`\n", + "* `and hydrod` => `anhydride`\n", + "* `lesh night and` => `lesch-nyhan`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "agz8B2CxXBBG" + }, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "koRPpYISNPuH" + }, + "source": [ + "## Installing NeMo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HCnnz3cgVc4Q" + }, + "outputs": [], + "source": [ + "# Install NeMo library. 
If you are running locally (rather than on Google Colab), comment out the below lines\n", + "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", + "GITHUB_ACCOUNT = \"bene-ges\"\n", + "BRANCH = \"spellchecking_asr_customization_double_bert\"\n", + "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", + "\n", + "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", + "# comment out the below lines and set NEMO_DIR to your local path.\n", + "NEMO_DIR = 'nemo'\n", + "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_M92gCn_NW1_" + }, + "source": [ + "## Additional installs\n", + "We will use `sentence_splitter` to split abstracts to sentences." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ddyJA3NtGl9C" + }, + "outputs": [], + "source": [ + "!pip install sentence_splitter" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qVa91rGkeFje" + }, + "source": [ + "Clone the SpellMapper model from HuggingFace.\n", + "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JiI9dkEm5cpW" + }, + "outputs": [], + "source": [ + "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8saqFOePVfFf" + }, + "source": [ + "## Imports\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tAJyiYn_VnrF" + }, + "outputs": [], + "source": [ + "import IPython.display as ipd\n", + "import json\n", + "import random\n", + "import re\n", + "import soundfile as sf\n", + "import torch\n", + "\n", + "from collections import Counter, defaultdict\n", + "from difflib import SequenceMatcher\n", + "from matplotlib.pyplot import imshow\n", + "from matplotlib import pyplot as plt\n", + "from sentence_splitter import SentenceSplitter\n", + "from typing import List, Set, Tuple\n", + "\n", + "from nemo.collections.tts.models import FastPitchModel\n", + "from nemo.collections.tts.models import HifiGanModel\n", + "\n", + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", + "\n", + "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", + " get_all_candidates_coverage,\n", + " get_index,\n", + " load_ngram_mappings,\n", + " search_in_index,\n", + " get_candidates,\n", + " read_spellmapper_predictions,\n", + " apply_replacements_to_text,\n", + " load_ngram_mappings_for_dp,\n", + " get_alignment_by_dp,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mfAaOdAWUGUV" + }, + "source": [ + "Use seed to get a reproducible behaviour." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UlGnNKTuT_6A" + }, + "outputs": [], + "source": [ + "random.seed(0)\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RPPHI7Zd_fDz" + }, + "source": [ + "## Download data\n", + "\n", + "File `pubmed23n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", + "\n", + "File `wordlist.txt` contains 100k **single-word** medical terms.\n", + "\n", + "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", + "\n", + "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mX6cvE8xw2n1" + }, + "outputs": [], + "source": [ + "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed23n0009.xml.gz\n", + "!gunzip pubmed23n0009.xml.gz\n", + "!grep \"AbstractText\" pubmed23n0009.xml > abstract.txt\n", + "\n", + "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", + "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", + "!wget https://norvig.com/ngrams/count_1w.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mBm9BeqNaRlC" + }, + "source": [ + "## Auxiliary functions\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kVUKhSh48Ypi" + }, + "outputs": [], + "source": [ + "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€™$•¼}{~—=“\\\"”″‟„]\")\n", + "\n", + "\n", 
+ "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", + " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", + " wordlist.txt - 100k single-word medical terms.\n", + " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", + " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n", + " \"\"\"\n", + " common_words = set()\n", + " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " word, freq = line.strip().casefold().split(\"\\t\")\n", + " if int(freq) < 500000:\n", + " break\n", + " common_words.add(word)\n", + " print(\"Size of common words vocabulary:\", len(common_words))\n", + "\n", + " abbreviations = defaultdict(set)\n", + " medical_vocabulary = set()\n", + " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " lines = f.readlines()\n", + " # first line is header\n", + " for line in lines[1:]:\n", + " abbrev, _, phrase = line.strip().split(\"\\t\")\n", + " # skip phrases longer than 3 words because some of them are long explanations\n", + " if phrase.count(\" \") > 2:\n", + " continue\n", + " if phrase in common_words:\n", + " continue\n", + " medical_vocabulary.add(phrase)\n", + " abbrev = abbrev.lower()\n", + " abbreviations[abbrev].add(phrase)\n", + "\n", + " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " word = line.strip().casefold()\n", + " # skip words contaning digits\n", + " if re.match(r\".*\\d.*\", word):\n", + " continue\n", + " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", + " continue\n", + " if word in common_words:\n", + " continue\n", + " medical_vocabulary.add(word)\n", + "\n", + " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", + " print(\"Size of abbreviation vocabulary:\", 
len(abbreviations))\n", + " return medical_vocabulary, abbreviations\n", + "\n", + "\n", + "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", + " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", + " Args:\n", + " medical_vocabulary: set of known medical words or phrases\n", + " Returns:\n", + " sentences: list of extracted sentences\n", + " all_found_singleword: set of single words from medical vocabulary that occurred at least in one sentence\n", + " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred at least in one sentence\n", + " \"\"\"\n", + " splitter = SentenceSplitter(language='en')\n", + "\n", + " all_sentences = []\n", + " all_found_singleword = set()\n", + " all_found_multiword = set()\n", + " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " text = line.strip().replace(\"<AbstractText>\", \"\").replace(\"</AbstractText>\", \"\")\n", + " sents = splitter.split(text)\n", + " found_singleword = set()\n", + " found_multiword = set()\n", + " for sent in sents:\n", + " # remove anything in brackets from text\n", + " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", + " # remove quotes from text\n", + " sent = sent.replace(\"\\\"\", \"\")\n", + " # skip sentences containing digits because normalization is out of scope of this tutorial\n", + " if re.match(r\".*\\d.*\", sent):\n", + " continue\n", + " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", + " if \". 
\" in sent:\n", + " continue\n", + " # skip long sentences as they may cause OOM issues\n", + " if len(sent) > 150:\n", + " continue\n", + " # replace all punctuation to space and convert to lowercase\n", + " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", + " sent_clean = \" \".join(sent_clean.split(\" \"))\n", + " words = sent_clean.split(\" \")\n", + "\n", + " found_phrases = set()\n", + " for begin in range(len(words)):\n", + " for end in range(begin + 1, min(begin + 4, len(words))):\n", + " phrase = \" \".join(words[begin:end])\n", + " if phrase in medical_vocabulary:\n", + " found_phrases.add(phrase)\n", + " if end - begin == 1:\n", + " found_singleword.add(phrase)\n", + " else:\n", + " found_multiword.add(phrase)\n", + " if len(found_phrases) > 0:\n", + " all_sentences.append((sent, \";\".join(found_phrases)))\n", + " all_found_singleword = all_found_singleword.union(found_singleword)\n", + " all_found_multiword = all_found_multiword.union(found_multiword)\n", + "\n", + " print(\"Sentences:\", len(all_sentences))\n", + " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", + " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", + " print(\"Examples of multi-word terms\", str(list(all_found_multiword)[0:10]))\n", + " \n", + " return all_sentences, all_found_singleword, all_found_multiword" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XU3xeCBVpWOL" + }, + "outputs": [], + "source": [ + "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", + " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", + " Args:\n", + " i_words: list of words in first sequence\n", + " j_words: list of words in second sequence\n", + " Returns:\n", + " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, 
end_of_fragment2)\n", + " \"\"\"\n", + " s = SequenceMatcher(None, i_words, j_words)\n", + " result = []\n", + " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", + " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", + " result = sorted(result, key=lambda x: x[3])\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2ydXp_pFYmYu" + }, + "source": [ + "## Read medical data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WAeauax0SV1-" + }, + "outputs": [], + "source": [ + "medical_vocabulary, _ = get_medical_vocabulary()\n", + "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", + "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", + "big_sample = list(medical_vocabulary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FRli7-Kx7sOO" + }, + "outputs": [], + "source": [ + "for sent, phrases in sentences[0:10]:\n", + " print(sent, \"\\t\", phrases)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rL1VqH2_dk93" + }, + "source": [ + "# SpellMapper ASR Customization\n", + "\n", + "SpellMapper model relies on two offline preparation steps:\n", + "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", + "2. Indexing of user vocabulary by n-grams.\n", + "\n", + "![Offline data preparation](images/spellmapper_data_preparation.png)\n", + "\n", + "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", + "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", + "2. 
Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", + "3. Do post-processing to combine results.\n", + "\n", + "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OeJpsMwslmrd" + }, + "source": [ + "## N-gram mappings\n", + "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for the English language.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uH6p0mOd12pi" + }, + "source": [ + "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", + "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", + "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", + "* `<DELETE>` means that the original letter is deleted. \n", + "This auxiliary markup will be removed automatically during loading.\n", + "\n", + "`_` is used instead of real space symbol.\n", + "\n", + "Last three columns are:\n", + "* joint frequency\n", + "* frequency of original n-gram\n", + "* frequency of misspelled n-gram\n", + "\n", + "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qul163dB1sKp" + }, + "outputs": [], + "source": [ + "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eWxcrVWZ3Pfq" + }, + "source": [ + "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. 
N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WHKhE945-N7o" + }, + "outputs": [], + "source": [ + "print(\"load n-gram mappings...\")\n", + "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", + "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", + "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", + "print(\"Size of banned ngrams:\", len(ban_ngram))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "49IcMBfllvXN" + }, + "source": [ + "## Indexing of custom vocabulary" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b1K6paeee2Iu" + }, + "source": [ + "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in the given corpus of abstracts.\n", + "\n", + "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-gram\n", + "mappings (see Index of custom vocabulary in Fig. 
1)\n", + "\n", + "*Though it is possible to index and search the whole 110k vocabulary, it will require additional optimizations and is beyond the scope of this tutorial.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xWb0jGqw6Woi" + }, + "outputs": [], + "source": [ + "custom_phrases = []\n", + "for phrase in medical_vocabulary:\n", + " if phrase not in found_singleword and phrase not in found_multiword:\n", + " continue\n", + " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", + "print(\"Size of customization vocabulary:\", len(custom_phrases))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UHWor5pD2Eyb" + }, + "source": [ + "Now we build the index for our custom phrases.\n", + "\n", + "Parameter `min_log_prob` controls minimum log probability, after which we stop growing this n-gram.\n", + "\n", + "Parameter `max_phrases_per_ngram` controls maximum number of phrases that can be indexed by one ngram. 
N-grams exceeding this limit are also banned and not used in indexing.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hs4RDXj0-xW9" + }, + "outputs": [], + "source": [ + "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", + "print(\"Size of phrases:\", len(phrases))\n", + "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", + "\n", + "# Save index to file - later we will use it in other script\n", + "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", + " for ngram in ngram2phrases:\n", + " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", + " phrase = phrases[phrase_id]\n", + " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RV1sdQ9rvar8" + }, + "source": [ + "## Small detailed example\n", + "\n", + "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing a misspelled phrase `tarasic_oorda`. \n", + "\n", + "We will see \n", + "1. How this custom phrase is indexed.\n", + "2. How candidate retrieval works, given ASR-hypothesis.\n", + "3. How inference and post-processing work.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kGBTTJXixnrG" + }, + "source": [ + "### N-grams in index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ryfUlqNMl4vQ" + }, + "source": [ + "Let's look, for example, by what n-grams a custom phrase `thoracic aorta` is indexed. \n", + "Columns: \n", + "1. n-gram\n", + "2. beginning position in the phrase\n", + "3. length\n", + "4. log probability\n", + "\n", + "Note that many n-grams are not from n-gram mappings file. Those are derived by growing previous n-grams with new replacements. 
In this case log probabilities are summed up. Growing stops when the summed log probability falls below the minimum.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x0ZVsXGBo8pt" + }, + "outputs": [], + "source": [ + "for ngram in ngram2phrases:\n", + " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", + " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", + " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "20ov23ze4xeQ" + }, + "source": [ + "### Candidate retrieval\n", + "Candidate retrieval tasks are:\n", + " - Given an input sentence and an index of custom vocabulary find all n-grams from the index matching the sentence. \n", + " - Find which sentence fragments and which custom phrases have most \"hits\" - potential candidates.\n", + " - Find approximate starting position for each candidate phrase. \n", + "\n", + "\n", + "Let's look at the hits that the phrase \"thoracic aorta\" gets by searching all ngrams in the input text. We can see some hits in different parts of the sentence, but a moving window can find the fragment with the most hits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t_rhKQ3Xqa8A" + }, + "outputs": [], + "source": [ + "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", + "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", + "print(\" \".join(list(sent)))\n", + "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "orkRapbjF4aZ" + }, + "source": [ + "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", + "It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.\n", + "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", + "\n", + "`position2ngrams` is a list of sets of ngrams. List index is the starting position in the ASR-hypothesis.\n", + "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JF7u4_iiHLyI" + }, + "outputs": [], + "source": [ + "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", + "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", + "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "45mvKg8ZyNbr" + }, + "source": [ + "`candidate2coverage` is a list of size len(phrases) containing coverage (0.0 to 1.0) in best window.\n", + "Coverage is a smoothed percentage of hits in the window of size of the given phrase.\n", + "\n", + "`candidate2position` is a list of size len(phrases) containing starting position of best window.\n", + "\n", + "Starting position is approximate, it's ok. If it is not at the beginning of some word, SpellMapper will try to adjust it later. In this particular example we get 5 as starting position instead of 4, missing the first letter." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Sjyn9I98udL9" + }, + "source": [ + "### Inference\n", + "\n", + "Now let's generate input for SpellMapper inference. \n", + "An input line should consist of 4 tab-separated columns:\n", + " - text of ASR-hypothesis\n", + " - texts of 10 candidates separated by semicolon\n", + " - 1-based ids of non-dummy candidates\n", + " - approximate start/end coordinates of non-dummy candidates (correspond to ids)\n", + "Note that candidate retrieval is done inside the function `get_candidates`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cJnusVfBRhRX" + }, + "outputs": [], + "source": [ + "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", + "letters = list(sent)\n", + "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", + "# We add two columns with targets and span_info. \n", + "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", + "targets = []\n", + "span_info = []\n", + "for idx, c in enumerate(candidates):\n", + " if c[1] == -1:\n", + " continue\n", + " targets.append(str(idx + 1)) # targets are 1-based\n", + " start = c[1]\n", + " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", + " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", + "\n", + "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", + "out.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qpei5o89SmaU" + }, + "outputs": [], + "source": [ + "!cat spellmapper_input.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9rAmO15SS6go" + }, + "outputs": [], + "source": [ + "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", + " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", + " model.max_sequence_len=512 \\\n", + " inference.from_file=spellmapper_input.txt \\\n", + " inference.out_file=spellmapper_output.txt \\\n", + " inference.batch_size=16 \\\n", + " lang=en\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wd2aq4T1N5cs" + }, + "source": [ + "Each line in SpellMapper output is tab-separated 
and consists of 4 columns:\n", + "1. ASR-hypothesis (same as in input)\n", + "2. 10 candidates separated with semicolon (same as in input)\n", + "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", + "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ravgEX8cTFty" + }, + "outputs": [], + "source": [ + "!cat spellmapper_output.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "az26364-PHb2" + }, + "source": [ + "We can use some utility functions to apply found replacements and get actual corrected text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lPtFa_EhK8pb" + }, + "outputs": [], + "source": [ + "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", + "text, replacements, _ = spellmapper_results[0]\n", + "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", + "print(\"Text before correction:\\n\", text)\n", + "print(\"Text after correction:\\n\", corrected_text)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "efF7O-D91FLX" + }, + "source": [ + "# Bigger customization example\n", + "\n", + "Let's test customization on more data. 
The plan is\n", + " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", + " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r_EFPnyDcXZt" + }, + "source": [ + "## Run TTS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "i9F5SBhmr8rk" + }, + "outputs": [], + "source": [ + "# create a folder for wav files (TTS output)\n", + "!rm -r audio\n", + "!mkdir audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JMbkNVt7YBAO" + }, + "outputs": [], + "source": [ + "if torch.cuda.is_available():\n", + " device = \"cuda\"\n", + "else:\n", + " device = \"cpu\"\n", + "\n", + "# Load FastPitch from HuggingFace\n", + "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", + "# Load HifiGan vocoder from HuggingFace\n", + "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", + "\n", + "# Write sentences that we want to feed to TTS\n", + "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", + " for sent, _ in sentences[0:100]:\n", + " out.write(sent + \"\\n\")\n", + "\n", + "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", + "i = 0\n", + "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", + " for line in inp:\n", + " text = line.strip()\n", + " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() #replace all punctuation to space and convert to lowercase\n", + " text_clean = \" \".join(text_clean.split())\n", + "\n", + " parsed = spectrogram_generator.parse(text, normalize=True)\n", + "\n", + " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", + " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", + "\n", + " # Note that vocoder return a batch of audio. 
In this example, we just take the first and only sample.\n", + " filename = \"audio/\" + str(i) + \".wav\"\n", + " sf.write(filename, audio.to('cpu').detach().numpy()[0], 16000)\n", + " out_manifest.write(\n", + " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", + " )\n", + " i += 1\n", + "\n", + " # display some examples\n", + " if i < 10:\n", + " print(f'\"{text}\"\\n')\n", + " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", + "\n", + "out_manifest.close()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9T3CZcCAmxCz" + }, + "source": [ + "Now we have a folder with generated audios `audio/*.wav` and a nemo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pR_T1HnttVjm" + }, + "outputs": [], + "source": [ + "lines = []\n", + "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", + " lines = f.readlines()\n", + "\n", + "for line in lines:\n", + " try:\n", + " data = json.loads(line.strip())\n", + " except:\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Free GPU memory to avoid OOM." 
+ ], + "metadata": { + "id": "bt2TMLLvdUHm" + } + }, + { + "cell_type": "code", + "source": [ + "del spectrogram_generator\n", + "del vocoder\n", + "torch.cuda.empty_cache()" + ], + "metadata": { + "id": "ZwEpAOCaRH7s" + }, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HrensakWdLkt" + }, + "source": [ + "## Run baseline ASR" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IQNIo2M_mqJc" + }, + "source": [ + "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript_tmp.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", + "\n", + "Note that this ASR model was not trained or fine-tuned on the medical domain, so we expect it to make mistakes on medical terms." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NMN63ux1mJiG" + }, + "outputs": [], + "source": [ + "!python nemo/examples/asr/transcribe_speech.py \\\n", + " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", + " dataset_manifest=manifest.json \\\n", + " output_filename=ctc_baseline_transcript_tmp.json \\\n", + " batch_size=2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L3swQ8uqqgnp" + }, + "source": [ + "ATTENTION: SpellMapper relies on words being separated by a _single_ space\n", + "\n", + "There is a bug with multiple spaces, observed in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", + "\n", + "So we need to correct the manifests to ensure that all spaces are single." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z17sxkmXrXpJ" + }, + "outputs": [], + "source": [ + "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", + "\n", + "for i in range(len(test_data)):\n", + " # if there are multiple spaces in the string they will be merged to one\n", + " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", + "\n", + "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", + " for d in test_data:\n", + " line = json.dumps(d)\n", + " out.write(line + \"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PuKtfhbVkVJY" + }, + "outputs": [], + "source": [ + "!head -n 4 ctc_baseline_transcript.json" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aCJw9NEXqRg8" + }, + "source": [ + "### Calculating WER of baseline transcript\n", + "We use the standard script from NeMo to calculate WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZmNEGVWQsGo2" + }, + "outputs": [], + "source": [ + "!python nemo/examples/asr/speech_to_text_eval.py \\\n", + " dataset_manifest=ctc_baseline_transcript.json \\\n", + " only_score_manifest=True\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AvPwJr0ZqdkN" + }, + "source": [ + "### See fragments that differ\n", + "We use SequenceMatcher to see fragments that differ. 
(Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RAeaVCpMv78y" + }, + "outputs": [], + "source": [ + "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", + "pred_text = [data['pred_text'] for data in test_data]\n", + "ref_text = [data['text'] for data in test_data]\n", + "audio_filepath = [data['audio_filepath'] for data in test_data]\n", + "\n", + "diff_vocab = Counter()\n", + "\n", + "for i in range(len(test_data)):\n", + " ref_sent = \" \" + ref_text[i] + \" \"\n", + " pred_sent = \" \" + pred_text[i] + \" \"\n", + "\n", + " pred_words = pred_sent.strip().split()\n", + " ref_words = ref_sent.strip().split()\n", + "\n", + " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", + " if tag != \"equal\":\n", + " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", + "\n", + "sum_ = 0\n", + "print(\"PRED vs REF\")\n", + "for k, v in diff_vocab.most_common(1000000):\n", + " sum_ += v\n", + " print(k, v, \"sum=\", sum_)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dUSOF7iD1w_9" + }, + "source": [ + "## Run SpellMapper" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x39BQhYB6_Fr" + }, + "source": [ + "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y8x-yT5WqfFz" + }, + "outputs": [], + "source": [ + "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", + " --manifest ctc_baseline_transcript.json \\\n", + " --custom_vocab_index index.txt \\\n", + " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", + " --short2full_name short2full.txt \\\n", + " --output_name spellmapper_input.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ueq_JAPWGs_Y" + }, + "source": [ + "Run the inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zgkqiiZtJjcB" + }, + "outputs": [], + "source": [ + "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", + " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", + " model.max_sequence_len=512 \\\n", + " inference.from_file=spellmapper_input.txt \\\n", + " inference.out_file=spellmapper_output.txt \\\n", + " inference.batch_size=16 \\\n", + " lang=en\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RPQWJX8dFLfX" + }, + "source": [ + "Now we postprocess SpellMapper output and create output corrected manifest." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3eFU515yKvXP" + }, + "outputs": [], + "source": [ + "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", + " --input_manifest ctc_baseline_transcript.json \\\n", + " --short2full_name short2full.txt \\\n", + " --output_manifest ctc_corrected_transcript.json \\\n", + " --spellmapper_result spellmapper_output.txt \\\n", + " --replace_hyphen_to_space \\\n", + " --field_name pred_text \\\n", + " --ngram_mappings \"\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hRoIhhGh17tp" + }, + "source": [ + "### Calculating WER of corrected transcript." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qIT957bGo9AY" + }, + "outputs": [], + "source": [ + "!python nemo/examples/asr/speech_to_text_eval.py \\\n", + " dataset_manifest=ctc_corrected_transcript.json \\\n", + " only_score_manifest=True\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NYXIPusupqOQ" + }, + "outputs": [], + "source": [ + "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", + "pred_text = [data['pred_text'] for data in test_data]\n", + "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", + "\n", + "diff_vocab = Counter()\n", + "\n", + "for i in range(len(test_data)):\n", + " ref_sent = \" \" + ref_text[i] + \" \"\n", + " pred_sent = \" \" + pred_text[i] + \" \"\n", + "\n", + " pred_words = pred_sent.strip().split()\n", + " ref_words = ref_sent.strip().split()\n", + "\n", + " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", + " if tag != \"equal\":\n", + " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", + "\n", + "sum_ = 0\n", + "print(\"Corrected vs baseline\")\n", + "for k, v in diff_vocab.most_common(1000000):\n", + " sum_ += v\n", + " print(k, v, \"sum=\", sum_)\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "DJtXlqXbTD6M" + }, + "source": [ + "### Filtering by Dynamic Programming(DP) score\n", + "\n", + "What else can be done?\n", + "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as the maximum sum of log probabilities of matching n-grams along this path.\n", + "Let's look at an example. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "05Qf9wgHU_UR" + }, + "outputs": [], + "source": [ + "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", + "\n", + "fragment = \"and hydrod\"\n", + "replacement = \"anhydride\"\n", + "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", + "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", + "path = get_alignment_by_dp(\n", + " replacement_spaced,\n", + " fragment_spaced,\n", + " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", + ")\n", + "print(\"Dynamic Programming path:\")\n", + "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", + " print(\n", + " \"\\t\",\n", + " \"frag=\",\n", + " fragment_ngram,\n", + " \"; repl=\",\n", + " replacement_ngram,\n", + " \"; score=\",\n", + " score,\n", + " \"; sum_score=\",\n", + " sum_score,\n", + " \"; joint_freq=\",\n", + " joint_freq,\n", + " \"; orig_freq=\",\n", + " orig_freq,\n", + " \"; misspelled_freq=\",\n", + " misspelled_freq,\n", + " )\n", + "\n", + "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", + "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / 
(len(fragment)))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hgfKPKckaLnc" + }, + "source": [ + "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", + "Note that dynamic programming is slow because of its quadratic complexity, but it allows us to get rid of some false positives. Let's apply it to the same test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UhSXh7ht_JRn" + }, + "outputs": [], + "source": [ + "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", + " --input_manifest ctc_baseline_transcript.json \\\n", + " --short2full_name short2full.txt \\\n", + " --output_manifest ctc_corrected_transcript_dp.json \\\n", + " --spellmapper_result spellmapper_output.txt \\\n", + " --replace_hyphen_to_space \\\n", + " --field_name pred_text \\\n", + " --use_dp \\\n", + " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", + " --min_dp_score_per_symbol -1.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u8R5YHB3vPC8" + }, + "outputs": [], + "source": [ + "!python nemo/examples/asr/speech_to_text_eval.py \\\n", + " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", + " only_score_manifest=True" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "upvTbkFAeYtR" + }, + "source": [ + "# Final notes\n", + "1. Our paper...\n", + "\n", + "2. 
To reproduce evaluation experiments from this paper see these scripts:\n", + " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", + " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", + " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", + "\n", + "3. To reproduce training see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", + "\n", + "4. Promising future research directions would be:\n", + " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", + " - retrain with adding more various false positives to the training data" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "toc_visible": true, + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/tutorials/nlp/images/spellmapper_customization_vocabulary.png b/tutorials/nlp/images/spellmapper_customization_vocabulary.png new file mode 100644 index 0000000000000000000000000000000000000000..1ecd7ab5add501b7e2889142a4df67442318b161 GIT binary patch literal 39243 zcmd42WmHw&7e0Cb0g+M>kyNC+4j|niT>{eGDczt*cXvxj!=Y2@?oNS2!yyix|Bb%C zalhP;_ug^W7!3D5?mhQfbImp9T=RKWu!5Wf<}-q4AP@*s@}sB{2!y-}0wIt-MFf6v z!N0Wy{y}h1k`M-!4HNAEH&0B3WQ9PW%1HEEePrMs&F-VR0|#4|vaqG`uwQqVzUk6U{= z)BGz>ungr(3n^#Y`xI)&z$29}OIgv+A`my%y==!xm*yiR?s)~wABBQp29+-opDp=R& zE?yG})c8A!RAg2U)6EEngM$NNI^qqDmZ)d{_h#0@Y4)3;WH!^Hq9VBrUN_hx7vsQe zg?>k`EZOfNBtf;yjYM4nLPFc+=ARQg_4G)7sMy};bEaE^$%cl8Y&MG@81eo|YScLF zF^eGKa}UX$vS^?WN#S)n!AQlAjC4)I?5`m4I#sk`0XZV2%elZeWjS<21UQyIWh(l> 
zhd37vLQ*)aQ!ovO)3~%=|C8nzy&N{?c{ZhnicP}nbq(gZIh~M~PiF0!pHEptz#`_3 z8+ziqd^RC3Ff}=u#^<40syZ`0U6l;q8quvb8>=>-nEg*Tu68XrxeczzYhRI2TW?RK zV=}lL^3&6o;%Jn+BZvnWk$qct7Hr%2JkIwk4SQN|U_7jQ!-7cEfBWIfI5xqO(%s#S zgo;JP>v}Yt&VxfG=ji0*k(dS~;WAv!9_ zcbdVLTT(MJ?ty8OOXV0^Snz27jt0E;<~gHoYqQ4%OgfQqD2>ZusnLm-BvSLAwn4zEeK8bo1PJv6Ev&2-TfIDFWruL&d5@vEw+jbHx ze9R1-QdmeG6D5`RGWa_%D_?2(_*#mJLdcih_e!}OHY<|KoW^n(1|( z`EVOZetUO^$Dq+^@~}R;B*K8Y_X7omg}({D(_{N5 zH;cG!Jzslke-5(WGRJe+><7a9)zR18P5JJfFL_Q*4lpNu{}am7_(2@rfi2K{vFt%! z>3MZ>yxs#z1|2=Ox3fDsI(irU;|Dnh>c6qgk9DyS*t4Aj_Q>~e3m@UTAWB3AVHbCW z;*DeqJS;XiBqt~T&$g?x=?uYdJ&%3^|GPDmTIaZ<#E1?evGMlyo}Qi##3H7prv6j) zPw=la#?Ud(lRheAV`Hd^)kc5(qZBa)%~~s9yiG2L{O+gJz^DGUQb*aa{A))iZ}0n9 zdHx2^D|n+)G_KbZ*OmZFzm(v z1pFf6Pu8!-hd1FGrLAAqyC;7tOGFU$0m2psfPvF9FvtU5a&Ma9e}g3<7gyHS zmL(~WOYa{>1UYtQ3HjyArshxV03Nj-P|{;_je3LW;o}bk6~}~{8g@6pHInoEr>#z} zZnH9-7WdV5WNGOE$ymzAK@%HVT1u11r*n4!GXJ(OnJfh7n;~~BM4&8X5}(`SpUMRm zH5%dK9YEh68}BKMO}rzb?;G%y#Y}aGNwkEcp1_}Fa1e;B6!U(N#l|118Y*S77hn}Ua)kr8&fGhsPhlFVUE0Icm%Nz;*xr7Gi4 zPQL*}y~ReS;;~t#w%c|Fq&L(km;~VV*`w8t24i5}jw*zK-0E?H^|oh#Y~^~dyzP3I z?!*1Xtw^z2XwH0uIY0=RYE9Qz<(t&tCwLaY6G&VMtslM zn^VBM+V^28Q@?`l{_g8D2fQQT)di`hW@Zu-5{C8c)Q_HDZl>DC#KsB$9V6tlnO#_D zyIc!pYB?SMAsHKhf!|(=a5M_3s7T^=Dhdr9P?qNtGbt`Eo{Jg+Jp5fJj(o-5-+p~3 z0aEp>z)vbFU0q$;^>$wuEdu=gkx@{FGx$<9Q9z}G={y(mKGzdPD#axwCA~grKYsi$ z?1}g?0KBgE?orhs_x5NqiDjP$uXH(1pZnX23g&EuUIvh_^H2(VsI{K=-gN2Ysicfi^s*o-O9t?*u@<6*Mlr==SK2# z1D;g!cKz*x>t?d)tPH4LKu%6BI5?QR63}~55iJJ}ARh(e(K%!_K{q$OkGt^5j}zN& z{d?8{VMp)i>FH;MxK?}~?sj)A{ymNI_i4?zgoMq_&6v&@3h7zW?+97afdAhFRD~ni z+1;HX;FB@*1O)O$06lISw%viWd%nSeO7a#MgTz}fNE{AXs@zJnu)CQ$+KXYA*EMz?2CvY z+~uEMndY?+ukpJyFO4Uq?V>Qg1#C|X$UGYywmu@V5cra?*)H>*e7xUN$t-&eRiLcQ zT~wBofRHd>f!QmY-k)Pt_@0({BUxvam4rKOqZk8{Sm^8WGJqI&nlw?cqbVBwu@e~ih+gsd zC$(G>;CI1?={iGjqJ8`A+8n>YWh+hI!%&1a;~G54%`jntJbNJ&J$JSV(OS+i<(514 z$+rFZi?fomdURyw6ZnXLnu?10=g)&_T(Q9XlN|3uMl{sb)A_wL3Pg+z02A~sVd^of z<{}ifpFNj7ALHS?QrBeN;ZP5mE-0YaiN6Jbm-Q$v0=}jEq(4&R^VWUxzL;D6SXuja^G2Y`&oX59+ 
zjO9wi_5kZNARUnFdtW%xZRTivb!-*o|MqpG=nCBt-?7$fl!ef{b|q)edL!<{nGm_+ zLD^7@IBv8#(zt^~U98ACikWJ5@;L7jV;kNblyFZqnR>hnt855ZPdniDwxv6^aX|sD zt}2r>xs>6S5xq(S2RX zgcxP673?XJaBIbh;g~l;3tPvikdbe_=hZULwW%F*dsh(6%h%vG6;>CQAHmToB;OI`ryw@Z*`6sPg?W{Lz_~F-rO_WB1}k7Y3I|_EbV>xFg~-an383- zcb`g<(r;chF@997{fuJafhDh$rf+2S`5I$2`bU*ok9BX61H^+@L}d8IzVM{`{B-%Qsw zs?%r`Rm^CysXxWU5jnbG0haKFRSY8Mbvv^OJV%)WG~T&CtX^*4%)=>p2LL zqWKADeN(_j9s5fa5$C~>Clq36d? z1SnQ~>B9;Q4dwB=K22e_Y%DkYXl>)m{2^0rD1zbs{bWL)ryyqFd*Vc6W>Unjy1Nmd zbu|Zm4aI2bXcV(0n;N)>`5PIlxVdv6^k2^n5zXBX?+53PfU&mTLxxx|-)x|1H4nVgjWAdW&P4O?{m3EO}LW;?_He z7ECa|BeYC~od*=nZ04%EQhyr@`8WfC&DYmgg2W996bGPHpe$2=fB)mFTOS||S>$~T zAe>&+B(h6+tc>i&`yUw3d_Sq9S-E~agvcgTEm8}L9N~$YKG;6N!5GiK-HDyWXNdTUvTuGCw7X~ zJu!Rg)yJM`T)B$o{n@=p(l_7_8ZW1~gEqA?1(n5Y=biFF#MW?@t^2hH3Ic%vv+hes zo1GWoZ*aQ(mO%ZBM)K5i({lD_qt87Pg9b~?Z2p`^u0z6x_0jEiB2PVYSH2j>-l%BNu?3V_2@q!Kd;H+Q8hxH{o*m5clA}}s|IBsbMY?Au z2;&K`M<3l2k0c5IyP3PwcpE;p4XbiFJy71gv0;zgGfv2+>ytM=4AGt`h1=i`>m8*n z2Ynv?(XYH=Rac(>?OXF8-;6;-C{wD^5{H$d1HXVJUg|TA3)AQ?p5nGS@}1%w;J9cN ztMfv>#QG!#JHPX?nu6lb;4Y5Ao~)2C(hR$6PYz?Piks}Xf=VIG<3>)eLEIp!XTeA7 zuSWRa2lHXa(W*=rX}=uW*FX8rM$`b^ub>F1$^7tbN6QPB z!FX0}*oE7o;q$)1kS=U%PCRn*+YXl2uDb}8ybje06$nYV1Cu+++|QZhp1A*E}Dgh8x8m@ z`gCjX`exL;DLYg)LxY9Vk;`xRTK$3@Q?~WKi5*a9Y~vhuwJ%~G;uDSHFMOaMA@Tkw z*c&emC7dOz#b&Y;bxDCY2QJt5R{3FUyT@IhrR_@n%Y;@&hG(hf`Fs+{ zdhBV^%g){{!jZKn*`is}RnOGr#4Thl2b6w7qPX0g%XAP{CGq$5wk~o*bHx7j`AA`> z*gD;csD4@HrJ<8?6l`Lxu%dYeA@)eS7CP=GFDuGtsgPp*=%G zWtZNQ5IV-m=|ntOkXA1Puz`7O+umNaBYFe=(hd5#HK`Nl_z@|d0vgwnie3{cr9QCI zhE;iQS2OR;^b_@WuVvRYQ{L|GcpoDha41d=|E_lA zGyG-0&!#2+yN9XTgJihq91=YDt#WDc^G66BD|u=#`!U6s+wS~zNV~iXxwwA-q+V>+7fF@X9PPb+9ftagcVW|JYII{O21E&p91Q z$*oD6s=bKHaqjvuymD$rg(XK>%}FKmx3U$56@| z0+|^?>}Y=1gYukeD1&T%d3+1c+)qcp+vUL7eY}uRvoPX$@lxz`Rto-bX@L)#k{Mi|5*pYFmG2#tQ4%@Y9gZ;!*LkQ6rm2zzup3XXS0wdDxC9Ko8oJCFn!`lgOT-A9 z8sl&tT)a^KE3|@^%7N_$wMCA+g=%$DaH$z~S3S2>$HWHvwa~XPuz6fVAL+2ecj;JM zlP2laV80bpB>wp{J#)b#VM|u~D7n~&^U(26rL`K_)SGUN_Uw8jVS19Ezzbbrtor`8 
zj&k9W7@IbJ_QmBl1YAx!KdDtmoN-)of+MFsmmQANEEyyAyWm0v-LcUeLkL0t-RN+MGl$*5Okp{am_` zk~**SeDyhoIFyo5d1+ZPpHtrn^n()0*#^{}t?IA=vZF<6=@WwvM50!;-C=4Dy_~+4 zceum$F#%o7gu#&*j&+2#cu~V*Xr8qc&MIn6yiGx;)>>8{E7?2EtI|F6jChq#l9+!^ zl=Q*I>}yAJ3eykkohdn3j8`)~=a42hOR8|VZ^S7_Q)Q*Y(Jt&?$~j=p6^$x+{(c1` zpY$~4aGt_{ox>SpIAG+H~ z^=XJ{=AcXPazo$5w(;I;UI`jJp4|TRrBv+iZafVOk$CQ1&`c`R&+qhRXhwOD8va8+ z>jPU!a_%J8_ z?7=Br(M{RyS#q_seAP6b6i#jm-3ctoSi|?oqF+kY%#(iFJTbr@u4jo9T}20HjY*Cv zI8d6bsJr+4vRylVw!^eSEyK=%1`ugIexb*UO$d}yrQWp*SC8lXrmQRbZBsQ+fT!VG8h?* zNy|TqS~%~_g%IyieEW^4iM9Y`@L4g{e!lu25MkCs)^>MJn7Ch2UtbS^3grOQ02CGe z7BJKKJe&s-7?uGf=Nhs*F;)4c66*2QlgCbdbS0ijlOe2tdgL^g$1?+ZlEp;!wg{T* z1mLItCCztvkEIgclK$9I4Twu3ig^A8tt^l4p)TWw1f!$9Mne96hfs-@;)dVc8)+!;eZ4OhFt{kI_^7lvCHHhFz)8s0m02ltcHT|5an zky5iYZCYrPx3(@Q&pbj6a0g>UGJWo^0H{Z!^gQKjK4mA_TZ`APUjsY*RrnS33)t`k z+6RE2PwL))uE~Mo;3I^=_UlwIjMCuHpR=BHj#|p4riff%@9jd;i@QsH&GUlm6}29n zmOU%gU&X7&@?t-Hh!{wQHqJ%F zXWy>WkFHHU5<`;g>Atd2`To*tRSZlQ`QbsW`lXhJk_4c|S=>k{{`34}

v%~iH zO23{k0f-+Wq=|m?B=jdUP?+zQc{sNzFkt}_F?Pz`{)>SY{+}I#5CZY zAX0!h|2@xMa3uE#KCKfZ=mEYRl}Hc7TmYCGA0J;^TLZxVj#5;R7t-G@<7S-?=tco!c^v;?d2L0G^d4Aae7Ole*7p5Ncje}V}jv3qj$uj zTM*Pw$-G7u&8Hi5(eP9&9<)|h-loS~iOMr;S+K9=f%>M=X`y7x7@Ta!D9p!%^$NQ1 zQ2NcKE9u5ynT_^NkRiRU8ZpVpNLleD&nK`riEUpxNLD#mLYg*CqAC3v06z(HLRh?3 zb%uS7HfJ!EJd~0HwSeDkz2tgnET>dInauT8m`XSHz+vm_&s$-rf5lcf9~uM9P4%#W z*nvu`TnMf{FlQqF?bdS-o@c}vttFQzF=ijI6C0&CA3THlr!_itx2?Nb1Xk%I;PUy? zAL^3})WFq9%|AsXFzeBG{cf<)AY8IsHI1Ptox?`yR>jFK9?d99OkV|_eT(JnA# z-Bg>~V)qtK0w`6A#jfyerMvXGC{xGNhn~5-Y(%Xq`B|!JloufF4~q^QJe&?|;*3kp zGbry31eNj*8mwEVc)dPvmGq{NQ zPLI6g_WX{vH~GM2&B}2~-5MRfHqT(qwXiRFRO52GRK$vNm0P&5LKoZ8HJ-pGlli?@ zfI9%nZ>JBMSJBdH29R7r%y0yt?4wISng;%^2^z(p{wAS&QG5DI^Y&zb31ELnN=dB& zD2(l2zRxw!Z^v3D{SAD-8^d%1zb4zp82YotB<-TI((>|_EXJXT=%c=no!0Ts{>dJP zr%6c!J+m!&yXasib?4LfjRH8b?n!%aX+tbbLyj)`VtN_0?2DoyEJ;S@LcTBBB@iH%}Yj&B&{r3Cd8!Wb-h2TXH%~8yohJ>8Um>_F^+`(m9tb*^2W@|XC z^-dUtwT$tUV+-dVFyKB}$P%R*H`~EOtwonrG-SrIeVkfY{8ZSQ}WG7Z? zSe%?o5q$I1MC8n0aj-CWSml;;!i|!r>VkKHC$uro6Da;+GH#&EPQ--W4|MB56;Gzf&#uY zfG!5Mup>*oouFAlpTxw(6cnB(qJvtV{RK%bT*-E(dYvfekkO%`FEhC_R`k#Wt; z9@Yb}Zyb3NBa~P3(2@RmOvLS1Tr7?Y8OStMlkdhv^Pl>;-Olj`_f^W|R#=DNIFd-$ z@MJJDQ%DYuM4Z1yOKL&ezcYY8OOEgk+j(`9vtun?+p8ojiil1*@y&KGJ*hf+xpsFY zWV!|M*lPmaMdD`GVc|7HWc&t|)HcOQy*BYPGsh)$b`fA>bIZnxD`slk!$@W7DRe9kx%e!e zzb}7lG5+#%Q|-OEEH|VQINL7Otb0WD00Ngg_Ft!a7`{H?B!dB_K8e|BI!8Vf^+=`9 zd9plqI9iEoZJ*Izee;}2{#cZ;uhK5=k`>L{u3{(uNi95?T3k_OqO#WVf7n{2om${X$aBDVgAb=wS1_9yW~(&dTMUQ-^|ahy`A38FED z{)z3ZXFk^3&nB`7koghl>W$D>Usi4Ks_#D}*(^A>dM}jHKI3A1o?6DoK%El2W5zkK zfc>Geala%boAzU3ESit?Dd9Qg`nO}z)v*_+bo3+CU`#$Yo3L{h@8YBVE}1{UjgG`E zbY8V)+PW}W4C%R2s`(kOCcOPJ_1z^j~%D64}Dq9~fAusCi9 zI*_bH&~$*M8FPsf!L*cGkj}0#nXjloVPnYEQ?tq#uky@{alLgeWY$-_FPOk&zKoHl zxHMg@FIhv3j@*Eaa4Q%OWmGv{%b4H^fz{@jxl=2su^ZrhIPcF2y+WquOM&^E~ZGe8V9F`<;Cw|m3y;y!~) zi||=KjOa9giFk2jaMAmb{f7f`C;8XS9UEp@MXjFNUBSGq9Sdjw1+*m*0A9eO^Qn3f zs@)*4mqACwd8*`Uk&yGlB+1JGUvyJMHEv@xVg}Su7Ia}>tx1qkE1g|PLOK36eYQ2+ 
z+9kZzP8Gpj!o~)+F_35~gI~4gL$Xx`Y78{%R=~y!g>SLnoFA+MW!s*=ERnnP-Ahsv zZjZGOCi9^LSY6F;67sSyW8Ne;{;fpZ#NA2ja%O54H0=cHyhOI9Yt1Sx|FEbP@F71? z+W}@j;h~t1?qScl3OGP?e*f_G zMujCm-jogLxqqH%8Lo@SAq{bMoX;<;mbi>#S26!kf*mJ9XWVTc90s*HTUhKtkS_#|`JK8{NDAN@}n8t#Bv+&G&1 z*lxdtXARuUO`F_ODtD-prm z;)Ea=DZ=-N($~Q(PFAq=-NQEmZ7>XO5KKOYv?K||3WSD^u!oPE*{`qjlPP<5j{9S) z;{IS?R-lAN>uQ;>-d1fY%-13^F166p_pnaf8d7$M?S0bZJb;c5S!wIMhz@pkm>-5% z$+5`}OY~_eMVkJ$zWJd1<)>IIEkoW^?O8v+a#AZwqJ0eKXxDmqd=kFGF5WH`V^`w} z#G;WHV#O!C{UX0QbzOj-GkAPEj^=pqIV>jYfyLDU;WQ4QR5jMtI-`SQaAG*HX2za@3}$se93nn=J=PZ5B$g6;g@c zW6H#4PVM%-KOuFE4Za zP9iVt0v_I{VxGD!ldc+%bI|nu>vA(S|9rVq3#l*6<|@>_?GC{Ml`ndKF0Y;!8MLzgEa8^9vHI8V97RBGy5FD)tQmamPq z>79vcf<^7O%f^(IiHbnZ^w#9gcNi@Hvzr@jgDV5wDH+SBnrIIzAF;!bTf%0`_x}`GoGEO!6^*(Ky zVBJSzCdYqOBT3KWdldkvY01t0TE@-c(Er@|bykl|`qB0%XS zxozH5|B;1DGtn)**q}1743z=EV41j|=wzP6NhJQ^pBu&Tij0Hl_)^u*L#3EL_2#IB z5GP@{`I`%N+6xi6Nl7JT&Ia&MyfnEPD=W=0l6TMFEC^)O3z5~Km-d2|LQHE)530M` zIUHToyOulOG2I7dC22LEMGW!DC@H7HxaBzBc+-vt$6?-{i1`_2>vU zqF20c!lj)*YiTX+OcczNsIytmgBgnl{*^aybar+R1|#)Bd?o)}BINgQF z0l+#2XuHQIKs6*2V6a4lhf=JdLl?ddd<$HRPxDoGOr;#L-X&hXq&!4}io|-UkZSUl%J zAMFt57jL_Gq>BS|l*pYr7J_tDl_*>s%6M+{oq%<4c5LEbnncN*6!cm@@@$>ja=>AH z8o7RfEg9I}G6hm?(bgQeY!pt=I>dJ90+yOV=9mybwLv{Mw9{jb)yr-MU5TztD2G1T zR$Jxf2Y;!{&i~F>H>_`?cZnmL@QL0L(L5bD8UU=}XV7olXWod-j(*O_0VIAFhqrD+($5JQCX^M42ijMm&_0s&rK^ zg%|yuS4&Zx4`?y-%n{^b)0?+*X!|wKb7<6{K14WG3Lkpt)Xn)xMza?8c}x>LJ_;w? z2Sab|dXgN1`^?a}1jdxE8wgoXq%G><9y&i~<6pN<4v*1M7KAHLvS5Zh`I*~dYXt9` znI6?Vf7wVyvva^PW#m7H*0PP4xNzZSeBTq@dS15bwmAZUS;>XX5S=4idkaLVN%({q zoGftbnCQl4VlzL;DQFu*2KVM6oe7;|oV+zxXXCa*D=OQsq7=XQ=mWef&~~B?`_lW! 
zToQ>e6!#v(j_sRHfQMKC0gW17>xPNP5}q5_rF?6k+FS^_Yd0|DjA;u>@V@WmZW0z& zQn6lqZ|NM+05Po5Xu9+tAHh6ZcUK{I=~7?K`0~tTQC6h!1e)g)w$)9a$HNo$jwk#x zjeaBJq!Dsc;BC_#3mFU*qU?NxUJDm?FMu*tH^8Ms#uGZrU3oImpI494#3nTM><(Cx zp=pey<%kW!hfV$IG3K08)TcFKS?3n`d`SksQC3_SWh{7B7)6?sX0-NgT;Sz56n)g7 zmCcdpuh4n*N(jr;E9(1!a7<1d8lAzjPh5LC7%O?^PBCb*s(MDR@g&WKhQ~uANX@m_ ziSA5Nu@h+QehL3fMvNuX?%oe2t>E-#i*jji#4riw()LUsfD)!izklvl8K`HszXd0I zcs_nT`wwx_d$(iDl0wJjgK0uYckETGS{T`@#1Rk>5Z$)jdEk#*gUhq6uWH^u>t$?W z8wFomhtvRgyj4vjn2936+s3MYH3S_sr@_=>;@&H$Z5IL+*X(9HK$j zO&0+oEOamH3s8(zKN0uIg&D|=!IWw=Udv)LrNnleYAs0ld3J=jaxAlSF>QpR;|97g zk8EIZc(f!f|5SZ(Xi6C~g@AwjL+@OOr@vUrg-SfOBUoyXUx`w41k18$=*R`LZ#qPH zTzUXQ&R$&~JkwIfn>+g9wKIi)%)|H*p4dRIWxvhzksey(FP4+NL3j6}L6>^64QYc1 z>4?ccLw4Ws18QZpy%OY%x_kz$hWHzggvC~!&!5j4G8qS^QuL4{GD>LwJFw-8P?FhY z1_p=_mCqlE5Vj62_yvy+5>^1wqov_@(CwY@RQUwl(WtOKED{ zbahrzjOEi6!OG8Hf&|N5$4w>pF#Wh6P0>-|A!H0}cBtXe7$T-@)#2GGo*ihyyLx(M zCFRryQhpHglF9;gg%rY-Wt70-X;+;MSN7ncj{*(d{0 zMq^`cEVcI(m+Y5})XjOH^0VIkLR*&Xp8i)EIfM--h$1}*z0tpXCt0EaBm;D^{ z9ssj6xQnVi{qN`I*V5=YaI0XL9uBs7!Tc`>=eCl!yCI*Hz640)yDROl(>-qp;Zuv4 ztXU9wWO52hOsW*&2~66I6vm_W&Pc7ba{iS(;60mIZTmGU+5K_$uGxpJ+$ zNMwXlebO*doB}=@S;AZ|5uH=x4jjKWY?!Pwu~XHPxR@|~G2gmlUNA;$`qZS{ch;V( zTwGskE>woK>F4!>8qPN)R@Hb$eQoUx)C2Iid>mdNWj-?VxV$)~Au! 
zufucK3S*|SC5-u+4ME$s*SeX%H+KlCP2*1%!;&g+k2gIu%uRU4wJ1_*!JY;Bbg~LS zVauPFI)tj)8e%Mi*Hmm-|3m4OQCB$}^M*6gG;4L2^ZG=iPHmY>N2|(T)gb_rU*S_c zaFC{^w2cUPrb2ABwTlv+hX;H`KNn8cXq)eJy zLf=YU&+|y(oT@x>xTwy&$h4$d4$LAQZ~2i>uYXi zHQQ>!X0!S+t<~;tUhJt&Z8Q_r6X>=B@WI_{h!L0UK7>E7qPg|l%3R+>s#8kd&ox{% znjCY4UD@gJ{PCY<@C=qpY~Q3>xl@hdksD@$0QYjKX^3O36{T(Ir9(EHs#*}*2YB~68e8ZF!lDe6Wo)p}>cp?I=; zr`7w{0h?=^g<6`XCJ++0VwdTvo5s7PuF}KJ9k_$;K)vhnYlz@THL23Z7x&syf%S8# zNo)=KFzH0^LHZ?=JtlePUCU%1!!_)r2JEWjsb}XE3{CaK#7xGU7>>|ia|{!y?@?zx z{}^r5JtIjO&Ojg`xT6)%Rkk__-9nMUl4Ro7Ea!h8o7Wu>fP6*&JT(89p(%U!ozI#h z3`^53KV9`x3PBn6yj+v_)S?Ny`kwHtV%}9~jxm+x-T=Ds0#0s+@EvdG+8p8lN7Ljg zddb;Fsd;&`sm8_p#CWwtUPEWzHS0jZDYDIl0!@*4SB+H_E^ik{2tiNgpbSmvuIGYc zq%?aE;Fw9cq?^a4-($CJ%kpfy%I49#zqm-*Jmg-4MI-(~YKpSJv3m z_sH5gWc)E1(rS;W%$XR*5$E6>Y|C4^bt(i}e(z~S}+;3#Y9LW$i zmKlrd5z~#i=`KTVqekL#F)JIPc)q0Pp5EX$nODH^-DF<8=!`uQCEO?j)>*5I7iL;( zVm?dKAP$rnr zb3Bc!I}by)ZmYkTn%aENTSv{_l`0WzWZe`a3q7QRVP}K(=3bsF$nVpcd+d{IH+&=9 z39(Ne!tCr!WfKUJ-M~w%Q9&&br{RQ;JKm{}_bMB<>n1cV^nkH3ZGt!ziC$bpB&E}| zhJ=7Y=|7b7uerIJBskQhpMYGcms+PpWexa8KCh z5&P&23qu=_qmhl;UWev3qt|&Hx_a@pG`4g^9GvbKsSiSaQqDEG*?KL5hxAiAv{*96(@q}XV@m5G%-^jFreSaIHOfZTRf;}v!#>qD{aQVv4T)QQGJ{ve z0q3HM(bcHFgV%ZpXRrsxbzY8G=8{F0-LPIXv@9C4i{05BmyYI&&Cu%v!)Hj23)IN` z=`}7QS>ZE8nnf<}; zMLv4IqsEe=OASMD+8^bx7<{Y^nrk(N9(879TMePPX(>g_S$fJkYuFRk4LA4HYvA2P zsw7|8n<>F=d$o|$5+cldYE_JZ?b=(l!92`kE#8h<5;y+GdZ+HP*H4oHOPcc#y|y5z zPvnSsam>fq_u|x!CoxjG=R@1KvkTfkH}Vv_M9Jxr(Xo|f`>`Khjs$32T2e$@UxeqS zqPi3|P219PT-12cm@4OdlP26bfq+X^_o_<_{7*dR+tBA-3EA20YCdcC?3TX*W%+#y z!8}Q^Fz#ktuW)i>Ro13_xAiQMWi(6pIp{PkpZhw;d^b}!87!x3YV_l5u&p$or#_m- z;*cdox#s%NQNG)O|N2iu*ZQf7rTFKTmyk;`7Ik-C*1C*=Ne2(Lldlr+UoyO_N{O9a zGD`G)VM$;)pBB#XK2nxAIjd5RC1D;LHE-f8AqK9r!67WUjs}^c=#*F6C*FfOop~ro z!h94)kzYKzyrztHZol$~xoxy?2>HbHeHCb$4j`9PSvP&k{7C)=&LOL2mfZuOF>pRT z6FpsGcmX&c*UME>TH4~(BtcgU9G(gX*lHZV|FYG*UTLdsh~AWa)LRf9BgqnZO;$UQ zs#!sP*x2W zDHAIA;$}j|g%v`C@R02~&ln?fnOJeI=a^w!sU@>)3jV>K>e% zuA7R$l=G9&<4k-3xRSTg^O@3R`4$1t9%Df9@jjlt%DZHGH#$u6m)nG#C 
zr$jTshe#PI2gW3@Y88?Gi9=}@d2LyB`V_gzywy0Ha>FN7uL3MIJ`E}L<`e8n#gYDR zl9qm(rP{w8=M43>?efW2*q0;^6D5o{Us>IlXTif5hDAdTgMos_N=` zIbUOGxNh)pHR1!Cg9L_h3z|m36M$oh!}_A4o&E820@vGQH5x z>+T7(a>q00a~d7ab*wn3xNTu2q!zHICNA?&P!;_AAr zpAaQj5~Ohw2=4A4v~dj{+`VxN!GgOq?(Ptv(Gc7lcXto&@Llpe@61%qS2g`7)zy73 zee0Zk_FC(=2U(A6h22M`2cwSA)k+*)TZ{{?7^DY%$c$s%#QX3xT3EB#*<8jmgHK2- zq){{Zxd8)49IZwpjwaTr@pxWQ&m{Jh_J>7OO%A<**8M8fbm@eb3do?2u(CUZ!y=^4x^>)dvoH+ai(@MS4u4D0%zi z-f3;Sy4akD*9*J4)pn-1ql$-Si)q5p4JFq(LX_iK_2`WGM#zOmY|NEC4arLK(hY(?MasD=c2mcCJ z2~k0Q{w1)qgcD>nw3Sy@CUV#kyWsP>o+dErKgNDbpAk3#YY>s=r8mLz+|ts65z_jn zem*{JKu*xw*tn~7RQh_dPR4ymnmbFYF3HgHCl3iFnbET=aG&0R{uP}kW zhBt=0Z6;9|r{|?{!iTP;)I}~Je)8#XAE=L1AuEGX_koJHBPR+xP!S`@k=px9iriDgBYshqnHF-OH3| z9n76$&rf$C(sjDyLk%Jh8|>G|$H!4oQGlKAyq|ff@wk`T>#9%ZyN^#(Q3zNC z91l&!FYP^0T!QBpt=RY#W8~VTZH(N6V-o=5#@ZC*54Pggah^j*@GJ z5LQEe&da8f^l|bt6Wi+M{yFBnA3Um^Q3#;sO-hD4?+oZK=AS|y9K4P>c~kzgWYbxj zy+2}_azK(!pK0F8DO$iBSzK;VfpCxoWmr^ftZ|<2)ppuot>aPff=YgOXl(U!RErlS z)FofO+~lZeQpD$2BOi|u(VRf+QWY!v=#(tbFroY9u=qc@SM3|hP;S*z36;%`YbkT; z8eK~Dq!JZSE^zeQp9V$pfinu29880Tm`l9)7~wySdw$PsHvL92oTl8Jol%dl1U+ye z!0GcxQuDZ#BaLc;VDaQgJqWs$&A3|3(Rn8J2-|8!pBzO?<=;zIMCg`0AQHLm7}Dz|(me)38^fxtL=KJUY;J8f2Ey8il03Gje_P z@&@&e(gYiNa>oWCV^e2^u*&tT>s7;DLItH;A)^|HA#TwQbS@XaM^&U`<~3fNoJQ0; zoGq&^gV!k@B%_!EyLj}(BE9UqN>8>dNxH(*&A%#5>R1$cO-Cyqc*737#exs$=@^WD z7dka5i7mhhk5scICg=#|Rb9=_il9Za+hvmiT1~KVPhE1XxPiOhkfh)jX|#9gakwP? 
zb38-&g>Q=LsXpzDbuj)8f1@(KcXs*pjHiBE5#Ff0a)bT6PrQtXQjA+l^sr9-Ny=yXS3oY8ib??31a@B;wjN?EDQy7wO`f1GQzSs-r zVY$1d!KJwk#HR9w-R3GNXRN({l7glyQ0~5KMeXN>f5E9tCO(=?$n>gzY=-~VXeS?H zrLayK4yM#gD{m*b_AyMA8LUb(z~y##*!(eJuVI?id)K6$;?~T9F*FA=qb>AA0jcPbc7}FfiD(JjfWIYbQAf}fatZSyq4qO)0fXRUZGt-q-A!0c+)C}WtnAP zebpshzuv6Swl*cXr=Gcav3$;cwd_@X;=wJfJ+Pr%Xd zNM`l-{Wx;6J2lJlIYF(DAv*XJ6eqM!n)zc(P^3hOTXR9Y62Ue(NptB_h;6!JfR)A z+hea0idHcTD(eBxzwMW=LVf4@oOZg|#~sElLrsxF>NAFmXn2Kd*VMVl;q^sktQVor z0z3g&MKcmAs!Gu~z=iPGEven;hqlpvJ75G*7-zea{CB6LKY#wr$jF#1Wo%n=`gyMs zo(sw!X`K5D4A{t@X=Iulv*CvXl?-?kU*w~kj`%5&t>0C0CM9Xem35TT`?RN+^9Es2~DgK_pB+;H| z)fWb8SEL8D$~qs$a-bIWs)n~ZjuYkP)zYe|_?0w1?zF(pVi*fD`+0U2Ki)Ab)Pq$` zdPF?|B_b|=rerGkRBe;8mycrIro(UM`g#V?ITXKdC6qB;t-p1Zx2L%@;F?uuzGh%H zfBSS2v#V}5I9@B4tfPqZRULbx)s2#eLK>IbVm8$JIX;ixdHzY>)@p9)W#nA;IFbt!Emo%mDLYPe*xi`m z=wddWp<4?vI(b@;5mgN&L*pl+Ia*J581dZ1AgfNt-8ty zq=@uyl#QWk6M<14Yl==aCA1<1YgBC!VgrG9r|qikVf|hYds#>i(eJB&C0a}TDdEj@ zvN+B<=|c)js-b1g=)FXmkK|lr*Q()F;u($-qr0TNtXZg-RvRV`8Zl~#Rvs~W-NcJ| ztW;lQH#@cVncvNuG(6Ke4#BOZUDlC-G};)$@dH_=qJ=AO=$u)3N3_&x^vY59Q@S7B ze91DZxIAh2$7mu&$J3&>){c92k>B`$06w~i=V4ABO*+VA(EKrhyL$H-9V+5(-ZRsG z%5dEv5=$l)@iJW&XY=D=jJn+9*u8o1xzcW8S5%Yjz25Jn7l`8!RT?kTZC6L%6y`fG zmz$b0)EzIXW-TYE#^^8c%>sFSik*$hxeeYOnDpJh2Y@G!LU#eNQx8a0OTxWl%E}UI|J$hYO z*}G`I`+ApFID!A3c-ns`sgsj8ubW{f$gnh^jaq-}X<%bAj8*K1J$o|t^5X8G znEDE%Bi)NH>uW`pM+{!rqfot((5lU+KgA>d@?PXY2~5F$^TwESct14(q03hy#R$su8Igqn*CBmgtC#M!mYH;NfF9 zyc`a93z&CM?n=;cTEBF$_9WOwFL)jbw)>UxcCW8np8Wah zf1>KKe>XPV7eI&I{_o4Lf<$J2*^HwZmD5D&o1!_Ef;(UHyDFYe)ST7wAD)*J&B7X84Ed zdkMs})i<(f)tl#7G>bRaqY$OtZa@V-mt;F(dSJC?9G&myFh>HPG$}K=9{$Q`p{AlD z4yRTr#%A`vOG9q@+Vj4?=3==mprPhr=`Ze-Zd%vY);3QzO}olypxV@^RNz%E0OfG_ z$mn8u_VL4?xrEgBWIIUja8-m&)i3cFuPl|j(^hb9bS3i@7b)p^LI5OGgJB<*^~?Ad zcWP(bJ5zOEj++FU3F?YV?fAfEM5XE9E9pN;c5hBx(dgXRz|18(@;tvE8!Z2JE&(C} z+p`VJa%*dbI5D1a83TwI{%i6|O1gKhK77W#(a?+(o>5s`CuL7k@^3irD6egbnww_T z3!|`b@fp(l#l-_CZ(okOQL}7Oi$#OzQ2O;+f-c!s{E&Fs=}`4R8Xjp^FZ<(b 
zLaT;_pcd;gY4ML{DM(=5dt8u@VU2HcjSM}6`8Y$8Pu=NoVPHVmtJ5>TdF?u3Y@kko zNFY1{ll3m|QcN@NO!^VJWM@|?{^{E5;qbv$4GzA)@On@!P>KTSE&uk@oOmY8n;3>e z`(|Dy+&2Y#f=8SgCQyUbaP60C6rWl$H5~s4WKi4+Q*cVT_<2{Nha8ZhFeM&Z9}aY* zro)%Wz>_q!Uzi?tg=D|4b@heL7movf{^XMF!Om4k5yN zrNX~*b-bij{~ndH$=iu>o{;Mxut`3;;P~^{q~O%B08+H=JQ2s{+4>YB zbB9(%mj=TFbA?%2tzLSJ9pjVg8dM&Z9UhQG%Y@g@uRlYZ2LImj?+L2Q#vff&q+5UB zj+MH6*Rr`H{=EnF^s|QDEe-Qxu>|zQnoND zYdoU#oFuX51j12C)Y>xc#mQ1v<{nsjWBkDN@oir_)T0Ar;%ih#lafaoS&E0v-~ZZT zSLm;SFT_k{1ML$i!)Al#lvLtOE&8F5;wdtTt69fxRumMJ8UGs`E0|3y1~~H5kFW#^Vqp?{850RXTwIpI@73Qn=T|k( zc*&1L^=DXe{AhxEs3vV@$0xek1Bw?#Ms7dQGd3|!n3O7y4iczQIkxS+J8s{t5rJ!= z;(=rpqp<$iiKBA*-KLN62g?ToIJ$J}ddGcSm(L-&hhOzs^2LBmG8(8k#W%1}PAQ>% zn8Wugr_OJdr3?!C08WBEF~i>9{8uASQPz!g8UTzM60?-XTiz$B9-prdKD^^qatwhS ztQ`Tp;#G2P+R%7&N^jMb1!+E758UuL-%jQ%t9WBy9{Nu&pghxqY;w{CW|-rdR<&tuqXhIpGs<44f~$*_ zkMk>g*JCJa7Fco$!<$p%ykF z1@ImEwoQ#i=QrG}fL50%nx6E?1YMk#hW+PqiV8Wc1j2Rl8eQ9zSGXM?A8|9@o2PE(IZ&__*M3q0 zz}QdVYLCw|@DkAYpN^h%y#KI3H|8&I^EnC)9Jb`u2mnUillfQw0>2ro*Fq4z5zLZD zotG)-0`jRTu=Sb@yUkcv*q%JmLi+bzCT(>EL_L$5y~*s8&c0rNLlX7hZv?*dzo_3Y zt9Q!qi))b)c^o2>jOS@37uece`*wyLG+!3UG{q367G(bzDCLo1H?QQ=mKueIKb>n8MxVv*-WDBz`9A9Tt_5cbF5esPU)E;RT81Ka><@oRv+z1;aM?Q#5U-4H z)^L#1cgMh=7ZDlXnPi6ASyi1`YrS`DdEvHmP>3Yi$kUBD9cL4z@si;e;7xw#&8cqH z>{kj9zqB_}HD`VqqPztX9Cn&dIaTYf_+I~Wli!{WZ1lu;%8f-L<%r&OX;R-+2TIjY znW3~{vCB;Mb{kAT=e7?``i~ZFDL54Jj~0$<6t}Is25Q=pFaLN^d5z2KR&hM$5AO%@ zk2sEZ%Yo2(33%Qt%ywVR$oM4aGT0Cri{FQd9n=U zp|;1w;hSM8Z21-nE-uvQ!WoV_r3~FFs-q4D+2{JAl_#P!SYj?9tBZxBYiHNN1>G)3 zSI2Q0ql{FGO_+pjU-h`TXQ|qW{aN9v?QdHQlbW{aV*7#Mx;E zRAW0rIGXgUo-6~I^g5U?&+^D78&`BY>=`Cx;DfruxCV}N%z!=ga1bMG={TP;nyqe%F{`(F0s*VzLAEN*|@XxZ}0!JFYi}n18p9wA=xd zbiB_Rq|`8E=Pwk|391X4+*e`0xL^H(i{JDVb$=>TSu6{#q0j+yB4vH|v%)x&Fr@`G zT$SE`VHM$&b;Ru&Ttp+^-AI3J-frOLs-RVbmd^8fAsx0R?|JYwRQDANiT2&MQiSMZ z1W;W8(3lDvvxLEW*68ftEXML^H)xjVZgQ6NBL8_iLK!UtBu(8HPfssCl7V2T{+IIJ+sZYRCu_Pf^Zg=n08$5gi0HCK=Hf*%A{)xug7m1n5-m`v$L z+bUrE(Vk&X3GEW+$}tfk?5*24?}+Vb30MP&=lw>y!_8@oe{_`Ud3t!NClsv~;@68N 
z`c3K6UctME_>C}lMkY^6C?{&1JockL;RT-8oXIzq;JdieQ@ zZpqC-70#V?cpZ@kgCA$>vxg0x{)SZmo7+;-nmD_8RIE;eAgc;Rb<6;SJ!#`Mw?xd0 zvatF1wMZzGE3N0Suak4&YDwolUZ%Jn((PpZo2MYXz{EOoDr+mYDSVbHY4@EGlBI2G&h`*Ok;7Q)u!~kY-%Q+D*2q3Wg_DvO_T;&((Vdmb?}70T`mH_sgziJ27_I z#Vp~J_E&x@<0P#9A2aw6|6R!gzJ2W{#xugm+v#%)le6a~hZ8hxo$R9O6!3WM%dv{M zgl-<+WHvaE(}QBQRjMkm39aX1jufopn*wh6-ztv%ETpw_+@lDVP~7VAD%^#tJu#WR zvg{Y1txk@ylw_V4-NtBGS;JVbr7R%*&irKw9eq84ApY2?gDXiqaS%QETs3e}}j_4u%q}p-p4=;vAzc zvJUaf0n9}jk#|Xpk#b7+{`jl+8{ZXb0Rxi7!M8%e3i4Eo;F#&5T$f85@T_v+AzVx0 z?C&W6*ot|OFHb<~axt#E0h>V-_eFyPD2bO=3Vg~Q8`1cd??g%*2>E0B4kfTwx4fbG z!IQ{t2)X$hq2nOfr`7nNWKE}sx$#$_KJ-4tdwW}^!>kAQcBNTu96^OQOGUhk6*w>pyy$bmm#Vw zB~RU9pCBGAN*Ql2of9IhR_ALWnPfd>7nIo<#$uM4Baay&Mil5{3oXzUmS(#CJJ=+> z#f*b~=nH7cO7`M;mm^8M$}dfFYt4)mU$fy4EgRwR0DLIdNn7u}g6WW*f zj1M4k%{<6Ut(DlSS*vjg!q$@hy~!_KK?B18c%zD7iSh14B$v9{>5K#miudP-nojTe zCi&ki5X*z!I32(MzQZW`MCuW!6oG%xVdb;O!UM~3CJU}RP8dzUq9I|5pyqT#UEiGL zk}S;NYB|%b32^bw9^A9#l4y&v@8yOhBimt^642xQ5@2##=(kmhmhx=P797>g+qS3N z-w2@!N?hf-S>XK{g5NQ>u36mT91J}~AlXYu7(TbuA!zK}kgMILBX3lT_~mHVzYL?z z^r06q6~V)x<-b_W=R6ylH!yq08cw*LguTN;EiZX6HF#AY{Pqy^>Catftx9v=O5+FY?JcoJhd7d>=OOC#B!hmd!piBkPkYUcy6gO(}Tm zgEXNArD)Nul#KlQp{s!=v=b34y~@bd{s)uXqQnms1hRkJh!{!ZRpH!lWeO&W-l15e zA`|-cXtAd)XCLg{Tal0RSL#z3!A`3$n|iia6ZMDh{u1xuM649=v%}7tBLRCYO#LUW zp|)s9OY~|bODxo^er*2`_%*YwjR6v?D&gsMq!$F45%x9%kAWQAaqtmQZ7yzz`h-6$ z=FNPx6G0w57NK}vP>G^dToA~=9r!&XmHSSZ7vkPLBs40T)_lOdnmrJ5^$&qMAV3w+8=={t2x6aiT_Rwxq^G8D&jbhbqcN zNK`3iMP*s`0=sQWn#LO%y7jc#Iv+$>-I7^@w2nR^`5jaQxe*xyS)E{f5vkouKLcEp zH6(@)C_j__P(@?cT6{;3UYe3}jzyWKA+(juenb$>A9~>+JCoJXngiiA4UhUU7IHb*=<_y1p91H-`wg@f8c^fVY5IHNAi}K4!encW)+6NxlRw- zNpM~n#y{wfxn9Z)y2MLgWUG{$@d7>bliw=sOfS?hB0UqR#dXc{FBRO~xoX3nrZ7>q zvujB_LOimkexA+DAjMXGp?|6sy|w-}E4ailyQ_VSqjP5{s-A}1_1zs(3>k0#*;Ni> zfWM1X|3;iLR<3Wvcf}b1n2XRh^@EHW&bkpXQTJ)DSKZ}wOYX6Ma++(;DSf9v zL5A=E^$k#<4pVf{pI`4}apj5F&AqyIEu$&yL17g_Z`g+8`&Z(1vRgmO(*i^-pt zYC{@-tqyGWk6`(1zH}{k!?U2nHg4+dF&x>KQ(HXOxakptQ~U#Jz+!Ou=SUuUU&s=# z9NZ;4G8V;Z9gDzVM%O&?38N 
zDAwnw_S-$6run8ZP0v=#&=IV)?&(lERp{~X^-bjmTQ8&Pa9-87!^dBymocsrJFwZE zpOOX*_J(Q61Px@pJfR^E60Yr2z=~K0h$Nzs`86)NUiBnOUUC0ntQ6swZ1AjCN!O4_ z{iaQ#ICi5?ndXQ+^McR>=1gPa+~YtKV|l8Ai9|aikt| zYVbSUpiD=juA4xzs1ijUus4TAfCUnaE_=<3J**B2r#knLIvlW8*zpjvul!FE=Ka z#7VTTB|#r`v(qJxOC(YTBkQdm(hXMSOFtQ~m}}2JDq1*GAjnW{R$97~&$l_CKr+pQ zT834p)}L#k`6X+gJ|6s@#|w~G;cX(4p734%s`aPDZjdI=`lF;__|lo<$YL@#FB&QJ z_|i$;ldG~uOM>tMN1Gl>ns6xb$`n9I2UT^)ml!llgkl4313qP%p{d9$hA(C}H>5R6 z&BV0=+EUY!7D;$wh(>S{o1*iW1nsdD`e{{x(%ny0B`{?BT;?nfV7*UpuUOo=k;nVn zTwHud*Av}yY*6tqddoXlkuu~ z4v(!pRhUedbPFIfkUR$Q+j5_oP^4zX#6Wx1Yx#C9PW%LY?CjTa^{jILMRed1E>4Kh zIKI=Uy*{uWTG~i#q?PKT5j!w zW7TG9YDt}&M^5*Xa4uWexs6ai_vA%x1!)j2`*38c3FX0YKtYpKaYRCc%}OAMNf%U~ z${d|>B|5&KTVuv6uh`1MeiB=)ImJrS)f1BIvZAmnIqbUk8UrKtcJlJJO0*>l5zEit z9{`?$e)4$(bue0dAfJTB)`5@xf-Cvv`;^wIVDZHRa`=WXw``3DMQpRw{oH;otP6hy z$>Q>*PD-4f6F(c~DgVlZPC?^UlpNXe_D8mSn!YeZ5qWpnb940}g-ToFY9t-lwCdK{ z;GWXXns{v|R$2H&Wo;MzK3;p$ z)zkk&=1A>DZz3BPInX$E=1`*xn#U1wBEqkv%knR1BmN^vNjD^;Nx7`0KoNu)1diG7 z{S2*Vu#J>2ZXfB0E}f8k@zZRpeQDs`=s@nL_uN_M{d);=x9h|{>MY0rk+C$>Mx#0{ zHg3VRqSMY(PA|3o4{r}IF$+toNr_Y>XxEs(IQMe~hZg$?YsDXN*K$GjP7n0I;2Sh8 z)pj?|8Cn$C*&I0~G;*$Ev?egSIUyXaf=pZkhj_F6kcckr)`}aA&#~ z_jG*Z%~Z6jYiSr|$xyUW2d8m8{?gBGUb|j?-#5DU%6f*si?p5N=97p=q9GIc+_INW zd35@0-H)5?1&)gU@?4Wg5|6)2LV<{IRg49(?4VJnB?AL@bzQSIfqv=xgRjlq+{EiP zFRCVke(whg(sjCd0QRmXitmKo>6p!6iVIB#)AQU9!zSKCj!ac;VvEeSM81E;AMV0x z)tnwub-+w>7mRcYay(f=@Y*z+;>+JK8qz8_>A3g14g@=hpXy!{XVbnrpEL82AxDc2 z(ksQCtz6g>Nqh=6A3EF4*#YX1j_KL=!c8StF*RiD{)F01jU*oDz06@x^I( zofWhf{<@11-~?CHV7D@mcKjk8G~-g&uK5CM)ZWC@vMZ!Lw)W7bZ)-j!UDCLoQ#0Ng zcV!uTd*A*!n9sb7`BAH50J~9$?yP@G-bix%PK1GJz;#IsdHMH|8~Q^trA~5{dlXEj ztlld_=aFVXngM|e{QL+d=uYpLPrRF!B6ur|UP7=HSd;yIzrO}518o{G*dN54m3c4v zid%{Cp_>tK?Mvn^QjSyOv3-e!;4SSFzDe#q2EB0IPL7&Ews2cWL9mkih`1OT#S7P zBH`CpW&FDRK~18pK3TC}3V7nz2&wU(N0plUkNqc?jk!dSHFS4R(yOIxognZqPG z3yG;J!YDYeD4cDJ63(_|{SNYLqpBNpw`$+JUGw=_G|X}W!91U7*-KWd zXND63JEwry5wF77+1U=sc81JrI1k0&QlSMn$J~QP-C9%#4-d`QU5*VhqFFfj9>i=v 
zBP*h82y(>$ZoaZqvo)=n)$X740I27O_&4nYaos)t45bzOSrrH312o` zl-HXU7S}>iS;bH<`s`8exO*bI*iKgtH}uJk4CfGRcN#ttjJN)FO9dYEw{f-d<48+SF)CP@fapZ# z2oh%HR;*s`(L2(ztuFyg8-I)fjEE>@!jH`t>NfxG4OZT;G=D(-1OAFQ8t-dOt4)~V z`Q;-R#frC4jy!mvN-I|DeDQQ^ow)P@%WWwA!eHl_Ez-cfsG;tDym|(o)C6es85ljA zL0b@zyn7u3X4DxtdE5S*MpL@B$Jzf6jHZzB-Ovnw(_VL&N&wEXe%I+c{UGBeo*W@9 zaqSNkq4ykG-HMgey_Leoi$x_PpCwkF0(OKiZIz7l(0bzW=)R}lQdM2tf1JgAoah3> zu1e+dv?Z;pzRD5+1@B@_E+zO7dnm@yt7493Q!{JN-D>v*<4{NDTK-m~Rs0e)Y*C!*f?ZIjB^+C~xE&Hx54*UQt0DSG#@F>_~Jz3%d zCK-~5hI}=Cdi~mRc+hgaH^lIux#Eh-@|jk6j)caEn)x{7A2xo?@cAsOgLfk<6@rfB}X>gbVeR;UzJSF`ScSzLs$6-@?l_w z5Fcs87k=M;j6-tKwgtt<0t@Z?9kQ-|GXUvbB*|f57XNIzp7c!9FM4}9lI(R#T<2O^ zUOuHKzg{ww{+0be@2N|11)t#qX&{gJsG#E4k}hbDX1$rO`lxLr&hpNEzGbs3w5>nrS{) zsRGI24vwR{OVAz#x;;BZD&`Yp3@^a>Ya3@k;D;?~v6sGH@=A5!l}XLiW7z-ch*<%V z8M_#xxoB!pgM%&WNla~6!ugertu21j4shk5$bn>N)ds%aru+N(^zK_fGhw z`FRDvm;}A};45D2G*cr^k@wZ6V!E8FZ`I6-)4*jI&!1dZWH_QDbjgjqbBL7C(cstQ zO8}EGgJyU6Rjo|fTT)F4l&a6Z?py96TG@+f7W34ysu=ClE}jt72Bns6`Dg~@1hB1X ztDq4bZWxG+YcAXidhf3Ycbc>HW-^gq_pxtkH{XptXWvBpt?{#RUgThK8ZoxSt#<65 znMn~}gS5!NiluZMv>sPov2jlHA+1FJ$a)s0*;Q>&RyzN{c$xX?eI#*3j z0~p0i{Ti3x!=iikIa`%xS;f|pArs#$f zn#;%jiD!1D@Ljp}er9!CfeH;P%~?SgMd9}UIMaVk?5(k0$Hs3uw>!e*F)EemkJE@-5&huYh9D>m8|{A#c{>`m*t zV^$5xW68UHaIZeaYQWvklsZUtgjt3B$!b*Q@AE)nz5@con@wz~^||LdZjA~$^m%qS ztvRsJG0lKR5_7Y2S3n?EAyCWt!ke0D#%_^sro6sBml~|d-1E|W0`6tl$9GJ2LGTk# zC>#@K@A~N$DjAjqg(WU|Y<{ngGBlf~nRT3+IR^_;!O!F5qaiJ8o3R3QA&L~H*2xJ? 
z;R?eA1bj{i2HENgV_T3|VqYbRf!^C4ge?%Hkj1*_GQyFkntn{4TBk@RM?~y?@#6A@R$Yq@9I2*H8A))iw zc-7xhBa5>~E2^$|ynSVg&oUaWllIXC%Shf;t14bz4wi#bZfgmML_t%Dg^^zP2lb!S z#uYSEm;#I6rYV=cn+on=?lgTvJfETyLcf7^f%NFLY4UOR2cjev-4P%Ea@!$zd7#u3 zu)N4CdOQ}V5lk=7piG*~_SJw&objEOzQlaS|Acm*(TA10k{TU}=~bRO8(8ti7c?~T zZsWragCy17nq;9#AyVhXF!lw6QH&DV>UNtv3XQ7B_-I(oe`q%ngNQ?2_?cGFLaW5oOT9kHko3vAB48bBl zTUq)Yg;KH_ErfZ&o^z>%;&({$ciS$QVYd_b9UyVZxRt&*_~JAuGjxvtGzzi_s|Ww#08C;WnZ8A6q{Ov@=FCf}h8odqP%Zk+y5r z6|r89?p_AY+&ooHbr4N1QHkFeenEFv$S{*iOgf#1mblZ)ubGyfeaS~DMyG^tG$F4; z;cl4#l%9=V!0gf|u3ZC3?89(GV8aLiy5{4Zbujcb4tj>o8;Q?~BtnZ}{t<{GcHCA?A9B1Cl+ zg-}A4Zu0I|f6bDtYY^Dhc~wV?&s)lHeNu`VHW&Ub*hk!DqYlaZQ{z-Z$+g0sbjL2H6BuwUDq*Wi8 z3oZBS-jq!>P*?Eqhg!?+1*%4_baq`$V`Dg3u5hVZ)>FU*U^U$&?P|osd$n9+3t~VMm{d}? zx&0|nI`s8g)?K`lwxJeA->)V`)U?uV?r+M$1* zhf3D?q*9fZh$?@emVtY$)$u{WLYc55#-@_}s0zx;M)rJ_BN00>qhgv%GCLDuoyYzo_hnm}N!OnPE&H{AK0E|w zOP`>cIQCG`kQ@Jq-Jf>dPDR5*%$>7}>HA(@>`Ji2_>ncnxEc7l8NO8l)?w+T5izY$gbN!=X>H9u8! zdyY^)oVLSOReJt@ezWY`Vtt=Q^VgRyYbv{*TtNrAQz{IRDf?x>L}+22o4u;Fqkfzk znRibFHp>}tH8e&|xPcpkfLg}{tUc`WI%k9%wmT)QPDA4&>s{?@6WlfC1A9%EIN6>4PO-){?r5nRFVCx&In=y%VaajIUJ1X>wx7Y%{uIaODUB5dGb`$~ z-TKCGQGaarn7}vvo_Urp-3j4$Q0ri0aCn^SJ@^I~^S8uzdp>KuHrn?4x88=-_4~(} zt*+09&R!f3fZ3hu9w;oo=t~f4^`{pvIkyu zG_JVr*3_&=hKCdEy`$K_&`-Kw>;Xb^zh{)OWs={_T28Ud~;bNu_<#*iH}5a)jDT6 zGT@8f2HY_Pe4T0HPH(bw)dg@z8B)&diX8X}MJADW6pcAto%tbYjER8nG8~8bF|NEb zUi3IAqzf2Gyba-c4vgh};yNYXWm>egA+u8g5zgyTaLXz+-WW{c3?rny0kTb{?*El- z(xlOwx%|b50Z94$Iu)zztfK_iUuzq@4#*c7>}#zinG_WlS(R)k(Px4eZ7IV_&!e^C zdsyHujeLcAjDyfr%|;`4G?$`^>(r7ANJ};Dev;!&!}TC*HZJRS><1>!pS(vH^LgPA z1R)kU8s%!qkA#g@=<9{(_x$xBZOhOJ4i)obqC58hQ!AkB0h13z0s zsVF)k-7}lFuJ4ZL+2{^WdY))KSEt&P&bQ8|&(b~zJ2f5#=GX#r*T13aB}oti14Kgb zMRa)BSx$=%1-^N&cZT9&VeOxOLJ*KANsVhZ5+vZP*JT%>O>Isf{8Q5*m&Zw|fV89j z7zHA1>@-xU7ggI6^s3jGbHQsWsgqjp{dK0;6RP=pR%F)v)XBMp1=9mU!4p46&I8)& zzqR5UG~=6(+tFNml}9yQhc@r^$F?lT@@&{`(93c-)$H!27u?G0V#(jEUTi_NeSp6` zyj=epKHa=uv>~)m+K~LmNDx7-%0uz8XshR_do9>uPX;2L0~0~g;k9CO9Ia_R=|Ekj 
z*!jSlIqIuSpKuO5|3qK7i$7Vw;g5cE%d@UjH<)y37rs{dzsZh4?Rmt zD)ag4;++kBGdVIp5iEP=yWe+RCKeV6vYJ4vBRzV`C<+**tZS_M6`R=z*`@LR&H*@} zxh?>4#pzb|r!#EM&CFVA{;y9H<_ZIVJ-9*pu;nWd!=38`RN!#H0@5E?#jjmo75BdX z)@0M2-Dl8b>nKSZPu5hXX`{m7$ER6dhS?dTB)4NfuGaphB!UgQT7z3eyW4oum@lz;+f-xMM$38NNn0D4*=a3h4@ct@N!nfh z^X6)>ARDQtlv60VVY&Tvo{GnA0xqNJKELl;KA1mBI(dM{>7310NqJtcp4~8L$)=LmUEyIsb}KYV^lb_(_zJ& zU&ETcFHplcRnfU7t}z)B%*Jh^EWt||u z1%@%x9d)iaMzP*I1S<;;ul<%=WmCE$?ea`BI#BEcS_-#=a>zvvg2WO=^stkQ)w|~S zbGn%=UP0%gnk{kKj#3wKVvisVA8f8kLEH8lAH!%rsn7cFs|JtuqbB#arZab3T}Le1 z6O;oD4u@V08fJ#-7h}=eQ}e9gu}(TquI3tHU9wr*C!hKL8V?QwIX6ywFB6ly&?sO! zTbvROLEc+@ge1Zk2Och8p_ZOCf}bF=@4VGN`$jUv8mfbqv26s|3!= zCDi~2E%S@C^hNL>4ejiXKE5*~jQn-l>qy-W>Hla8M75Nm z>$mCcJPyll@7tu+=zh%Kq=vKVWYMVLooee7#6%L*3#6B*P|(d2njK4wa`Fsc3pr>P zr`?q&zV};!%!^Nch7$8Szp1U$abBxk^jwWfiBC?x9aq)4y1cA|!5-&*@Avli(@;~N z0AOMUxmzF?s#r9xtE(%N5dqAbPPN+^EAYBMNkc_NHJ>P6{x4JaP1LTw0(-U2ZW5$2 zzXWWbXGF!LZK_OU`s6=BoANWke*#G|VDRcjLp&R~9Cp+u)YC3tC|Xm6KU)&3zSF$n zh(btMxM%fXzF~U$73AT}VltDcXRZvkmKOL2QI0`@gf6blw zKikcW^VrmdxDidsu8jfUx>F0FOiixx>xYDzWs zQ2Q=TF{&uBM#WM~go+^dF?a60|HA$Ll90TTe9w8#bDnd)-_PfrD_arlV0BgkdcNxD%(#HA-mSm z(P2Y0^YQX}{3%gr{~%Hy*0Y>lh9)t}%ZSR0j^+YACn%caBzg6qtFs%&P2GXvUc4OY z#??KMWZu_&7TLOSf?~Z@HiRJzKWOo2iBsUUVz=yad&oy55hW9#4RXP%c6E3nUSNV| z+~Q4hx;^HHsEq&=h_4{<47r79Q1^3$4X#OyNh-&mE04grlM7`(WQB=JKS4g>TvWAD zsQa4cJ3<}_W1f#p-^o(nk&(B5=3NtM;Asw3hUwFctFrXr+4eah<+GHfb(~B*|I7wO zVTMuQBbp#W_tJ$6x4CQ_8jZd>7{0IZ5lXBOYWZQbh5I-B{8*J{rE zJigkn>z74*kf+n>hvh!}nLi4|UG?M5*1zX99{b}@#BZ0um|H_z8)fL-51f0drYxy{ zWKjwN!ew%DlG)wz2xTfOEZpQl3Ne6-MO|kqm8>i`w!YQMME|62lh$ACjs#5tsoZa* z2K-dU;sM~>V~eHZ>Jc~}pK~Y;4UO5Enf`-NAouq6x_4%%iUDoxRA`BEpD%n|wGa^L zhyOGu1$JCVBqrV>X)KxqtA6+?pIfvT9b}Q9*=j? 
z=uY6{nWvx7@7;5057690{Pl235DE+5H%JFakgd)@!_{ZDdJ_3~2Wg4#K>u|p{-wHaaWTU5OBdxTGm#)-;oNBDci_N* zfy=BQPzy0B;%?6!@Afg7%o6W`ACpdgWVB2ifaM}hH>lqgAF*j9#V zn7+IujB>^ptVj4Eqx@5X&pd>)cZy3VwEnrT-0qn$K4pBC*p^jXtY<7v*cIkT*j$I% z9M6Dh!xlxv&-?&lnV?ObK@M+YthTs#=NluW-Ob-$cQ9JhUO29^^N=tU-@zAe0gopa8Ph?hOUcX z@=ikgE4!>VHv$37!>i+BlUJr?C8cC#J>1>xuU>tuD_LjB+IC5hlvItL$_V>Fr^nJC zM$t1*JZ>|1Xq7ck=5qid&GF_)4S`wdzp=Ko&5DM2^$+LsMt{zQPlHIi}o#@YyfDIB77gov(Q|3XZZ*g3R{{3Vol*Yox2cG(i$EZ0o6@odL5&&sHF&E@m_Sb!Q7F_a zeA?JAl@WV>n-3lz>TI=TWS9{u)6*5;oF#~z>hU1*yAHx6O5si_2j{Lhq*P{V5t4%T z8wU6UjYb0ux%#>1O*N^oJ5KBD?4;9GqxoDS-z98MRFquU$ghr$gw?r$R6X}3za55( z2vpZI4f`@*#_7F4gYvM&jBd_{)E-Xm$^F3JV5ZdDE{gtZ|4;}Cy^0rjqt8>d-)qv zK=pV$a26q(u=;f{1c(MnqN1X|*4#CEA%-;a^IMoAMr5dfXU}a+<-9(^gp`w$v$htR z6bc3x-rnB*XJzXg92_9)o!y(d6CJ^QMi*j9^E-UjwxX(ZsyTZH+jg&OuI@adqXW7v z8A{>y2sa*7-DVMb&Gl0{&3JRs`_!MFo`f*F@gc2U3%ux6^z@dY+29(%c2~9asw9io z-Std23bcCN3%tQykrt41d_hCyuM04|;uo8` z+A^dwkeMg?ZZe+8>0K(g(u^~AUSa-jo5b@PpUx$s(7p$g#ZBjdI6w?vpMLAoC@3V< zemI#|R|$diMBd=E%3c77aMzY5nBq5JE05{etR?hY_?!Ivx!-Ngj36-p4bi^PDONFv zY!IMsy*| zLxQ0x30~6;4HhfS#M;`WS(5{(Wbwtit(wg!9gT=XV(cZ08q&a7*2E4$0V)X-w(5QH zVz0Q)yShmWn+@J^E38xDO!j!)Jt1ad_Q0(>cDcf&k-WS- zE>F4j6uq+<5$uLPxPj_^7#zJ%h z_JPpwPz-+PnnP8v52G>&99N`l$&o1eW3DUNOC z*u8Rc*4l4!Y|^2izM?ITZ-OKX<6jNN>28m~Qar}05vYrp&_$&?*riYtdwY8k5s{KC zA(o6b)aXDi2cMV3lG4%@$X{t`Y5gY<3QI6C;&8Y`KY^q&_xBm5FC2Fu)NIm@*?my* zM5D`rw1iYNOLN?TEG{Y0XFF~*y*kM!wz?JPX4)JY%8k*hjF@7)8{p6&xTwAi%uw9Q zd8qX9Zf+C(6gMC!x!>h)ej6sj1ohM(1V}1#Gk~`)H;^jnul{YcH-HlR z2TUVa@;BD#Nf_-F#vuz1wc*t@H5n#Jyu*e)_4Q-iDdNIB-~* z#q#j;!@>r|$mZ_yPhDb6>CBRnl6UVmz^jrOcb~RX_qdr$TlISW<>IvTbh}1ZL{;e0 zb`#!Vwn+sj(sAbYp&8wJ6OJ6J>^U#*VsWJYly={wO&bW^}GSxJ%H$2s+!78#WUxg1;tGeXD3G&EkfDn_KiKkL$sLuRJs zmEPB`<1gSVkWg*2Slw9#R+nQB8U%cNWoH?+>QL^y`M2g0w(oRD$!c2gxP2U7`0JLa zzn+u6{YQBV+wHu+1$poaivSht-^YJtv+y@war-W%e;?JrR{s0x{{+APe;;ocv(6tH XKcsO>wuQ9)bFEL<{Pf1++Wmh49aE4n literal 0 HcmV?d00001 diff --git a/tutorials/nlp/images/spellmapper_data_preparation.png 
b/tutorials/nlp/images/spellmapper_data_preparation.png new file mode 100644 index 0000000000000000000000000000000000000000..24df8a8e0525ad6deaf7156fd2f5dc79f9ec9a39 GIT binary patch literal 75265 zcmY(qb9h|c_Xip_wj0}4W7}4fG`4Nq$;3&6##V)j2YW{;D@q~3e}xAF14ELL7FPuWgSZ9*18;FTlk+MVsTeP6~WoA|`s_&!;QWT7SXRLpc^ zh+?H!R8{}kdcL)wd!tp!S0VI#9mbCo>Toh&&YSV>yfrY z(0Ps+*NkwBB34IXLI`c&G&a3k#qk zv|K&vJ8OPpAnpIc_`m3obt32LHNGe-!j+^G;?|NSf~#$)vHcV9!` zBV#Fv4((1lY&^x68Q(m^I zJA4=v6UrJD`B1gEVsfRp;$-0Hj%$QO3+gE%& z+Q|uSwkAspm4tfL2{aWnm4$$WtS6W+^4yDMBigiBME~UjjGWqFx`R;7nkx$@V=gNe z0vzkDqGrqCY$cQ1Z0zcZdPK0%8yVRw##*#IOtBiZWgXT^c;bpzy8^2 zxINYB4|VnCvmDxlsr|e8fr|57R+JWt_`fp@hVFK6@C#ONxw6n^x^Odd@yb>}p0}Fv zr-okS5Lm)mrc;e*#>$8fX>O}a7PdLKVaMdL?)oKE)jJby-_K55tB}T~syYO7If=(W;jq!!Oj$eGWrAl0m&{cqb)RJIue+fzfB*aJ zD>?ad|EotbX1eF&{rQ-h9_qqU*VO~Iz;BMnp&@aMc~w8M5T^#z-Mxt87<!&UgJq#^y^N)a@hKfA;6pZ;!|0NqZq|Zoa;g}9N^;WG)A{;{RQl`|uxim$r?mf0 zmF*|#2PD~WAKQm~5=mgh zlD=Z7n5qELS@lnA*A}mVKqN_%>H4euf|_Q6bgWvc+KxbIhc_n`_qn;v54susC0ME3 z72y6vXK4!ipOt+-;)4iVx70;c(r-+EXU}~sHu8+K)iGtCS)U`~Y6=RQY|Y)5Mo~ZG z5}5p9Kl;)kdt#!Zb*%8E<)o8;>A4>ZT_EF$@-dyVlttBZ!sq== zJ;leXsw}Eus*RjD?Fhgl%jXZV)g#SnE7O*AUz$0&*>@l=Y(Z0(gnbU@;#vOo7E*g< z@6j$9yn-QX4VEsldpO#jG1s6a;D?a|yx7R83tdTZASU#?2NNl(m~UXqd-k`rwvB`B z;v=hDW^8|pOsYkc<3$$bQCU~&4X`%1jf-KS=fkJAdC67l(#rSn{9&yrd1*cJF=tfv z;_sFnXSp&ht*O4#i<#4|mUq3Z1qvK(SJi>$ znObiCjxhz@8tSGM2g~%E?xS2*(cS{a{eX zEV=7958yeUwt34L=TZ;3y;x++-4)*3*D21&EALL`Za=nb!c3g+qvst7)~1%bL=S$` zi(&`P3=+>zTdj(XQ3{>X(J|M)?vT~c7&2f>u7p9vxclQMZ)}9|HH>%KW7m1tMB?Qn z6ckhf{!dRY=fJu8Ta~%`Or7BAka~_oc~q;#A_GauBT%YqI`FprSh6fw+N1DAezz*O z%cme;uC-UYo7^i7lbI?egxQyr?}}XN>x;F$5B~D2+sv;F^?Oj;=Jgq0yE&)W2`+lH zd1tDX9g-L2(=np;+1Ds%W5>Px%`@B0{9Flr9T_Y13I=+ip%yo3--j1q&Cr09q^72; zb753C(`o!?EL;SOYP#0M^mx$L0CRBEsGT791A$3d#qe{x)Ldhlz=tAlZw#m!=KGno zh>JgHaG1BA#p%}xX#)}oV+WP+Q|k)vXYta37;*+@XPNn2V_ZBPbweu(3Cz1kjVod>GKs#gR7Hp)qD%o z$u!cFSHAAKxW$wdyt-?JPmYshX=EZt7sDgLl{JkTkj~iON~=0ngP)L1rfta!`o7EM 
zbwiADU2!yfaX;OWLuap6aHA7>dQm&nb7BD^R+}}F)G`9zVFB}LTl)-64>B~f%F0Bm zW!=$LZv)w*^sjJ?d=*2|^9d))t~zxE<@v$HLR|Ut&6wX>t-j}{t^Jk=S;t^%{+_gv z$BIT+ZF_#Q*hh%UJw;{#2Dl&UW?Dk3sI)$#D|f548qJVJ;f1Te+apHxeeR{lu&7=2 zxDa)=t|;hnk64Nef4t^5*#IlG8jNwDk4W+5ZE84QuAVBb?WyDGUm8oPvh=TL*sL_Rharv`-ZFt8zT|Cx5;H z!Wh*BVZ(K71cEz|b%hvBnl6y6;81Q#-q7Wt@&awqbQiA(STkui}m>x=C^yzl#b_6qp5K_a6m zOf7cAtjyF{pT-$6;nr{mX%XV|+U_#&%pu49d{Pe+KOLi8^xJ(sKwY_l&v1J;KW$qe z%myrLWNQ{357};IL|zSL&hc<3G?^Z%fKkCl?2E<|hOWCh9OpefSMu*O)wErXKPxPn z%cS(%Vm)g7(o8uH&wPjpt`sX^Zr9V z*y*VoA@k^q$H+VFY55Lt{42L$2LPxY|Lz~+q_B*(VM~LRgs@B`pVjy!pqW!+6zRPE zx6{KATKB7m!+eWZhE}&{Xd7G?gL})XU{t?-I+subsX=tb01Tj z_OChuRwH(Z6MO~p`-KUbPR{P4DFlTF*^k!>=Q~E>ZgZV;pjYBNoYT__|hV zOMHzIvIN7PsY{CU(Hb~esw=fZ9$fNy9AVM-_9&3Gy{?;5?>zw)E8}sN@_BLEFv%hWcr_NV&o`)C#~_%LnBTuUb+TeSpK`D>#6$-_X!luOfY`J*#jmz{I8t zZuh{+|9Snk`RFb+1JikBY3Ab_xqrwSBel{Tk_$`{lPF2Nv!Sm@R-_fbG>9la_?`$Z zXazTZW-ZUn$w!T^n`nyw%d{Do5&4+Fbin5xl1-5`=}S>m%D+E>9Ej1KpF4Pum)*m# zlAvmL!kPtK5E~008vQXSDi@#+R7ohU$=;ss7Z*EzNnqyWa4pb9h0^FQ+9z>V)~~`J zAfsifsV-ApRkB=F(b%$r-w~uwOG^yK9tls%!uIh|O;xkBwyiF$D(LH8+nZ0`hVngs z@qsSY!_Z8$>}p~uKV`Fn6zyf$=>2tFq*>BD4)NWzoqDT@dz4v!Rxrm;zFekL3A*aB+f1enO_TLDukmL(Y zPh+_j3%+eQmhXQ}Ndq88$Na@c!yLTK0Wi@}Wc7Ij^(eDgQt@doMR4=-+`gbNQ;j%$ zS#sd%wF`KN9FmZjt1I9?n48=#&Ij_$zGONZaSu_#M@fE#31e?4yicr;<6~w0Ad0ej zS-@Yja&{Ss5>`@8yt6nyAfc+28T^6J7hpf*-e=GA0Bq*tOB81HsV;%nB6-Gn_g-lGzj7xvs=l?12roV%G z?^MU)yceLu>=xM2b4y)_U1)M^3y~E>>t_+&#IHQhusoS?hLIK9`{lIhh;fMW*T`k< zfik4o!QWrcaoO3=qEM(f)&}W2FU*hwa#X{8Ft#D6rqWDOu>WS5g9q1UET?iI9{h+;rHOWz(s% zm)?%A&RY1y!@WDq7BWGPh1uQ`djCN4Fnv3b-iHzld!rCyqQqMr$g{W+rNaBMzuSre z>Jb-Cj?hq0aCB}P5Iw`iRFl%u$vhcCg**vXGN^JsG2SNvzn}&7cmFtD8v`O@`7p&f zDLK8ay2ChThQ<(hH^gs>%?|>oX+!IvrfZx6#()yW8zqi;D(PBlw-;HLC8w4}#IKHu z@PFUq==*|&>uGbShiJOsW`=*1S10HZb2*ZcrR*Wl2B@p@Y0NoeRg(Uew~BmAQn$&zT-C&;qbIJ3{iLH4zH^9phFsYUXLp(@YV;(@U#R!LD!YW zvMHugff)x%=Td{k1bQA7a|NR-ibnEQ#mdL((GreDV8u@tJ03+SmGc;qS)5+27zI{WE(XPL&O!dp-0Q z1!h^B9l*Z7jB_d?z0lZOGM<(Ld^()MC$VV)>zT`g;Km-r!C~2OF#KX~GBI0!nwIXS 
z#gcYW^~}G}vn(&@tr(Gbzh!&J+u0c4_yFVJnZ|+ivN#)hW46g$=FJ|!>p78}0rFW71)^j&exNUL`oN0L$8A$sVAUO3yCgCEF%??^4@ z*3e{j%oXUQz`L8z1^mig?8SXD2NcVPeF?KT%&?VwW_@OV0eD>VsLs1EP_0G76ZvT_I71Am-pe{Y8q4 zWkAJ9-zgc*Ib_NReEUItW#`e3oivsyS6iGP8*jdw(ggx5H~4*opFOR7!N}bm2r__c z7Dkc8Q*$$^B#hfxI}p)YQc)*YyHXhdH6ex$x4onp7)5Vcr*{A2(aJA!<{URu=Ju zdP&lGb6=cdPS6Ryg38?cBAu<$(FW~HF4TH*5XueyQiccYyH+N8hVy8#cC;CY9-r!< za~_7bM>x?=`35!ur_fP#!Vlm->L=X*`YQ#)axS=`^0w=6o9TFYzW*A$^6E?>>2H5J z<>WxM$2p-ws@O)yTbg5+l#!+xTD>PWQ2v5y;PY4MrR9VqI&>Yu9<7$Hk0S=dfq3)& zN%&ToVgom~Gwwvc6Al?F;{NEkbxMf|N9TjQ_Bvun$|MAIdxQ9S#99-7)BL6H`Zz^8 zjQ-Wf8K|mH&|DVhMHuM}1rl3!!JOnq9|IxrOt#Tr+#GihzyFE+^8jI&hL)Bo?@%W`GlBLDObxM49E2m@yTBo z5V_F*s3PFd&^5px6h6emno|2>!7*J8xlUi=W^Fc=$rzYY{m^uG_ar+mF{F65!(`j6 zs-kr!Bl6Fj3GkiMniI}D9qig%9$^X_@_N4XgUdVAbOH%#*F4TWQlm%B)e3lORxTfZ zfB7gGcvx z4@-zNm_YU#Z%G)+#&Cmg8@G{@Z0(A?2r{g`T>K5VCX%A2E=07{>XCrzM%nAah)38` zzyCyu>l*OiZ3%&mlq>=431_feb#11981Ad3pvktRZ@wh>F;ILvA!2CrdU#6rIDk{I zHvgi_*Ps9*$zvZl9>M#+RM0)y8ybDdg0;4VV(ihQZahS8TLjyKU82vy4KUf*Xg{8_ zBh;9$NE5mdn1z^1b+nLoyh&1|cN|T6KVBY-5IVN$`55n7|aGhP`OjR|n>GkK!Nr>Pz10b;~89M6%LAxiV zi1K#4BK_HK?<4*<8s4(8w>ee2woZ+APenn&U@I5l_L-NBKXb4YcJ23V6IWnXTz)H;dhBkaq74&aUsQmJJPJh@cWs-C?Mf)o}U@84|F4f3{ zoJcQ2nyR5MIyd$M_D_Exg+UA0#b(tTnj_HZC| z(@P?t89*oax_fu0PnbRMb7E+O`ikGzs6R^pM$y3U>2C-S7+9FgeBX3&mTQs7w#nOq zDn0>=khS%4`Stb#GSn1`#Ii9eC&})xju6f(! 
zF7`1V*~X9^e6gxp*urm=vB+&h8{^|qU-U8Q{whl4MT-2l%2d54MguZ5-fpAZXG*e7z^U<| zbnMieGNQ`^EOwz81l11k#cC?U8ltsKuQn|n^#nbD&lv8HVZSe zb?&n9=lh$R)pZ@YmiB-LqP5<0_)?itlqw0FQb(=zwVa|bK)d&7u|@il+9)zb+MnWr zg7%u^td6cdkeHl+_;az>-o+KM!NrB>)xT@^@iTRT2Ew?Y*r8fK_r4mk@(T>iw4?R0 zFcsE|k`}c?te&EQ6uexdTbK8Zem--{Y7$`RVFAlSF-gx_gk0g^ z)Cq*s=y_2F3Go{CPqJZHj`;qB`g$6{l+FrlZrD_XwFvU*B%q=b3jo)4%ML;>{iw99 zj~nSMd&m_cV5V$!2eX*z!P=Lu$G+YX&@e*_pv>_7;V4xwbbTIskUVJFyE@%!#=jS& zlS*)uI4&^N5jE%uea~_=(xRVVi?4po=jtL4?veVEZGEO=eM(MzCnHK_ZFTREz@d=Q z4NvX!vQZ!VspJm@RlDI^)X{TQxN3rg1my>W5qgS^b(Lc$m(J|jT#v`!N{2%YyvWH( zAQkP3h1{(y&WD&n8lTSOOim3Wl7RKl)jJi{1jZ{W{vKoMElpi1k1wSAi`$kbtjeNE z1TqFNi_M(&k>p~L7@x~zcG(*neqS z^e8brM1+!{ut@pNn5?Yc;W94P!1hyBlrFo&zH=3AcUGc4&~Nv7}m_mBGHP)UBWHYJr3i1ufaYm>9YQ0CPa8a zbP}^p2_KUEyr(xhcX)V}NZh}Oh-|7>TdxpTt&#z2Tyd zMe@l?J@4L3TyR?7$Mbj&gmd?^RuO=3ymuzV3#IpwBAa+AeLFV{)pBibA!mbu}eR9BtM#p^7Mm&CKqI38;5MDJb4Qhv{X*p!JoZy^ee~ zAw(DD@jwraePT`a|NG&QZ9wHqkvBXt71Xq<*DX3fZv2-W<#S^2c zO!o~P-PoFLfbr$FbwM603tGP54HD{$i+Qb0*-X3xZzkS;j!f6wuHG-JFMgYZ!xk5@G7|Fm@UD*$ zFBezl6#Hk1XD`?ORLVZ0At#TqYkFT=;)M1UhKK?y0K@vzCgU;!ApZH2_@-6oK5b}n zvO0UYAdky_P2@?j05jBgDvLKHVodM2QCYKGco8v7l$Mi8pMNy|TU&egT7wtibr}pk ze1x6$SR=T&pw}L3>9 zqWV}$=pB)pP6%&oBy4)>Yj1|Kn%3uYf5=7~%u7OO% zvFnwR&zRX~>pj^-BuJ}9TYRtXrH%)3D`bz6C7VEqspUr(+$4+>56|%5$46=fHYbBh zcGt34($A14bkY9?Oz8ND1bZVTMSS(=a9xk98Ug9hHwh4^$MSyz53=9%+3fI|dez&x z;8A=$%UECMMuLqMT~qDa#WrZ^dH+%eEq1aO<}y}eW@^f9zgBN?jfNiQsiLUAY0bvQ zMsaeI1POzj8;URFj=M(!6S~#ml}qsg(CD3xk-Bpz9HWZde>`dER&LQHIbJVlb-Xd|h1RLL-f(gAocIF7v4zarlw!x{Waz zrHH}s(B$L>KNGQ09!|}w1$SJMV)B#Y1Nex4Ycy1?ceWI`BIS)>NM;p=?7qf9!9lNt zhq)5h2W2xPmz7(|XYw}Mv=T`}D~K7VVMC|Jp@fR6Lx21oevUET46n8P6q&U4N+@KU9|%o$xYw6we4BvSwyO!s|!;2 zy`8jL8qIEX!#QlSned3ps_N?(OCbCTLx{s>T)LPO6Ain60UC?_UwZ9J8m73;?|SAH z2E6U4)cQk?+~aG@Itdp!!qeglvyZ&K_4+Kk%L}I-|817De46E8`w$8`^7(r(x8W5d z4aHGRE*Xym4^LJgfsTa*{yyQ^CJZGE&+=20Rz?@o=dNh(;*`pM=mwmXRYe|=>B}P0 zdXv?RC)5`WR-?V99>0!6+n15Cd`3^HHU>sS&pb42eAAf8_1_D@w=NMdX(re-}YL_kbgk;1z(r=)Gmfiqh 
zo(1-`^gqa(0nT6EweIrqa!v>XuXx%$LILyATnA``ux=D>B4HS%l%K+c-ql`pG`tV zrYx9!XIrXzSvYv2Djc7xVmWUtD5LC1PcxRv^**~t`HU@spxsB*dT!D#_hC>*c@V}h zJVheoqZDOjP5H6FT>G1wn$*-jKG_+nqbcu0S4wc;!FX#Odci>xE;6%RLR1vudJxyk z%d1C}W7Zck6U#CI#~*Q6^p!Iiqz+cJbW~~Cxv50x$!OSAbfiXV ztw$BmDVb%#B8&2rV15(uD>~xk23>#i%gV-{5umv@8jovJHJ2k)!TYOS20TkpWc#|Y z-WA!t1Z97-uj)cMu0x91`C3SUX}}e7wwTx1fyx&se~fP>8j5f{lQ@DhY`@lKZ)^;$ z%HG%G)Odfalw{|e;fsY}vZeJy7MafR_X^xA@Y0n2c<$G0Fn6AQ=VP1CF_n)&9jpIE z-E*7v9%D{&k;LK3%EFO;D$JqOXq7tM@tz!U+2fOkHqPeDvS(sCEMx;7Jj5gx+jomR10`mUPG z-cQ5M%X>68m}j&6?rN~#>#ebq#^}HOhIznaNQFl5Z<&tlX`(l0p7Ir}?(UTBi2((mM0 zbTAe{2Z^l*mL5hR)DNZ1^Yo-3b7;HTWWPIb2#Q!Fk6jiyy!=y4Tu6k{fMNw&JPixS z3w1p4h@ovGGZD_uGyx_|TVRddMY+YJ?6yDT$G6ijUoUculhZs(9YyscKnD2URJ+=- zz4&+0w_C+>qgu+RVCu6(0QMYc#+)s6<8+NC5Q@qq+NCR2zc8^xk~)w1W)ALxjiaHh z4pCA6Vcc%iDH&U?vg)$Uyd4OmW7KsNk1exwH#Q#7YwgSB^Jmce>UAh9sZA4=`FDQ% zaktU*=V)B8GGz}h9$s$r;E+zJC-CmCcE_JvCCY6eLFN)hQBoE#c@`ythZbe$bEDkwAmU zG(;J5{L7Q(i7E=)&TQt;LzHq8V4ISa{cIP0{%lk5; z9Br7Z-q1J?YFAX`DJfZ5fA1|eT$YOxh%@W>H|9Bs*{ni4vQQm(Gzu)(EseblA0p5# zuvmpVPu>}obtK%De#^j$LN)9`)9cUm{1ga4Vmx{GX)P!pXWKgGvX#gSoPxsa!4V-D zz-!1l2qZM}h(Ng_FF=(vO-0y)o`s1V;lq@e0CNga2k%{8Ik{$8S@q&!lY#%LDMEu{@;8MHD2f~rYhBP()aM9lg1%w@)43n6gRJ1^_$4g*gMLC z8e?8sTBoDX)Qs4hNDyeuJX!PA}3($hM^~02V+?>f8y(d*y&Nm!2r0 zZN+jq@HiF&@NIjSVX6gVW4~(x;{lB9>=82twBM0EYLwjFiy8Q%73g|VpOclfSoO^{ z=So$D8}{~G{`?Ui9;O6Dvo9!9Xh=*AId8k~>$kwxYDx zxZe=Lp5Nt;PsZ%nC<08G0q~;s_V(X>y`f!;JmI3=KtqzIPfm`e3KS~azPUBc0597e%w8blPk@j#70a4uX}cPjUZ}xK#B0*;@FYc3c%iANI6*^O33U4sPzO849Myw-%Mi_Zr)dax zq~c+OH^ni>D~HL_eDDZ+*IUhTRo}$3RzM(naO#lhScw6Z-j+(@ zN&+<>QnT71U>C@1VyaWhx=&>+wL<6~{Y@My5vy_s_2n%L-xf+!qW(1uT1rGSX}}52 znIRtpAXYd#alSR=W8guGavX%mxt+Jv6%UhMuX`X4@0<4}>DcNf^Q= zQ5*-yjbuaq8xGyKOtd7pt84U)4S1KUYt%Ry5(*8=M}CmB2{F5Pj~mb~tTjWAaB}7a z5zI87rf~k8wowGYE^hPwseBPT{{EBJ88|>R!VIQyxC-KW|LAuB)_<#8i~kJ9pJAJz zV&5iJB+lK);mp(zZus8BQ>Cza8?E*_90SoA&g?9fpD$i)hCNwALg);FXeq^IrL@XL z#3=dr1WV_-BetGzU9laFXvXB|CgfDK+;YBo5Nf~|M}u)X0;d)3qLM$5Lp*zt^}m@8 zeL^T9BQ|zxNJbI? 
z=PaI}oX;03Pwjd|WqKoc8iEN?2MHrM7WOwgtH4sB3v07FJmYK zuQhR3tZQ4IbMhO(EMu2&lioF?6*5v7E=5+Na1EWN zzfWp)E!bEXlT#BD<6}F&+Xp3eb?wcmL|t5**-VTZ)S`I`DrCzt0Si@t1^Kgzc=Gcy z)d}J)^FR){B47g5?m-kei!mPsMFt04*CotYHg?_kTw&r9gM+BZkNJUHF8{7ej9Q$4 z{XOGxF4=g9n>29utuL{yi>8yB_Kd{wlw@SzH9=}LSy_4IeYK7T>*?e>#pQ&?IS42Q zBJHg_c90~Kq@sq}J1w@OJjCc1!e^uwEz&236yl^3Z>-M^@SjRX24i}6v zBH)v7!?Zd6+Wak<-6@@E+n~!4xt(7+DKQZk($mqgXusK|h^`7RGBT??y+MEv=dkVX zkwxtPgm}yc@b~X=Aqc23%p)RTNY2fD*cH1wo}uWe1FaED$w|%x$m?S{r`{5#1%PoJ zTrd(Luh%7R1+-pugZ?u-+~SF#)Z6X%b;r|w zodBM!rr)L$b>+2ia&l&~IDM_hfdf)FixctS3}+jJP_$V>2D}@;efJbP;j>=w%-L4O zDgPu=&C=a-cFwG?uUE=DC@N;vWWqBlh`PF3Q7+R`l9z1LD$_n;_yMB7@WKRRlau%E zj`c_gXhDN=R7|ln|0rQV0pc6Q2;_=6-eqJ*9%9!F9zC zsGcMGmh4udDbB3VT97VgUHb?8>97>O>9ReEJ%EJ`Io&cPzR$G!DyOYw-KAlFnI*W; zRV$X>Qd)-(&eb=ek-L*djQ2CAf^EB5Yih8Vcb{qGvPvR;C_V_uD2pt!v^SU36$Ufe zC4D@exi~O|Go$pPw;7=T>64&Bpt`dUli>Y`&+>D-k8CVaD#@zA_F0gDnW1RdZ1X_o zS7u=H?&~{#mb~T!sv8j9(wa$rs6X_Jqemu@HJoI4){7vp&Z{}@cVH8Z^f;9UTvcI%K3tnSCM$cnCCLD1r9 zDM2#`_g(?PXY1fO#!ee5=L`6(EHu<}O3oaU^HTb{Pv|D6Fl0%kUK~f}%H#i!zB$Z8 zn`{f}OZ&)VaogUQBagOZ9j@MqbPCtkLpwU+4nSc-mm3t3X~;V7ypNmUV=o{^D_}zI zr3{EQ6Yx2u3D`tJ(uzR^Dd*Sx(@iD8h=u&7&RA^wCXtcRr6{CD3T&OX_@gF)VI`~l z4juI%@ez)DMG}<@HIrM9tuY&_AL}}-oKAWn4&Pr6J0h1S<}x-ivX5|maQbIdz&r2V z*`lEy7j?ib*<3XE`7_k4ihUp?3?Qmp!%>k~!FxT|$5CZg#j&LupOgZdqt)lAx zwm<TnR72$H)2p3QkCCqOvjJ2D{|+T`r%4q* zLTm%d`07C7pJa%>&2E!HqalqwOjb8+2*bUy1Kv%$ZMAhc&X z+6xDLS$tzcOXOM z(}Mp+L@J98VVKBDccDC~Wf05==V?kt4nZh@sedXio&!z<1W&5%tR+Xq+5MSro$AFW ze&8FM4GdX<7#J!H1)xtB&z8IdBqD-va#DWRL1+3vyW2f)ZRkLJ9p44O6FP$7M@4O| z14!!HT$-Q9s9=ErVdP(9W07$i*E9VfUKFl0q5NUzDfFV_L_nJ@fB1I7P=pANqR?ki zi&^7HD^?C)^!yx}%R}hB*QSgtlbARzDqT^NCGx8>KYy&Xqg}*Jj9qP-Ei4SIT%;(L z1at+WfzHAHdwTQ`nGYVgL!MA#Dk>@p3gp-BcykqQdUs#oi>;HBlXbueY4xnFu|d?U z?ZS4NJTo@d_8%pq;VSo)O{Jt~Vw&3&=$}%ciJ*`kGbN??kLlE57)R>$+GAm8dIKDp z%W2D_qM|@;DmG8F{N~{3h^p9BQ&=iGJ$b^R7L$pd4<<}$X6&d@3qHa)WujDhA`S%e zjZKUtk%=uGwgFX0!W+J&Z`8z?*~!-unu?k#D)kIsLO@A=S63NmcP&9L-+Tzo!u3{b8;*)$$5tgoq&QejE_q;!% 
z^jS>SiuoV;5M6p1Nr>|#H5=no7?SIY_bjC*cLp}RB*Si_CFz0bwWtN>;r?1 zYSAwZdLW8bQeqzOg3V#~J^keN|79TVb57rL(|UV>B1~%9r~x2NNT0V^(nWD8JComv zgFwe1Oq)b=MLv2XhGfjq(0EUuxvO_D|3Ch50jY5@28xJIBq4IrjA5GynP*$CM39Ot zW=PjtPDVzCNA?N&-WX}3owH?*v*Ruw*&V6z-87D{APX4)C`Q-C1KW@;8XqPo@B!?j zRJmaP*K`8&Uce%Vs@!p+lZ%R#DDxy%G#n*_=ghYkc2Gc1 z(vfIBNnE?}MhuyDMpJ>4Uo4kF!c<9d@l@*yI;Kn#JxeMDBQvz$YJUEkkzv!UQ{25d zjqL~k#Ni1Q%EHb9`2y!Bp6NsTT(PhZ*F9bsEFZ1bd2>x!d!(MrIyPn`Jwl9ggg3ix$Ml z!OjcpziBIDtt(ImdDnMZp`xOzf1$eg4x1ZqB%eoVTAA@_I4#Q#cR+yUtMh>)2rlV6 zfcBuO!_M-9_~Ckf2kHK&g2+BZ7l5Lg1QQ^1aXf7}vtD-_+)d2q#C3fWN@6vMY+2sv zVgCh6GE9ofrKh~zhYv#;QyC7D(EIDxuPIvC<}*&t?grC>+|Vmq?JoBqlLl6z_;R~7 z1-Oc9n%Gj(m7ZRz|I2r2ivgX?O!-HH2PR~+V_zTXx5*Js`5T;d3aSdiEu30G5YEIv zIGKFsDgw6lb(J=RIzF!Aspb{^xY##k6K}S-P6DB?WaYL1jR^p?4jf!N9v%q8f{<*i zjP!q7X8{va=UOx45uNpMM@wrRR1ZjSbOeRyD>yibnp&+{rN=OP&?{F9?f5EIk^CSp zf!o0#nIuq?!0x5%B?hlIk;KL4wPc0%wh$K?Squbeqks!^1@=)Ce7ghUnCo zbj+l*oHB^0;z9`)`-&ip@>vJQfp`aG05nT~SUkdiH#t)XPmtyW2}c^wox!~MzN8@J zRnYG)5{{nE?hs;vfrFz6a@yKCy{96V{Rq9&Yoaw4v7-VIo1h{|U#UQzT4q zk7wLFKH*Rfq~r=9i|3F9+05J;4HRYI8yvKl6=|iBJhLA>e=Zj>(2ma zHb`?2@dC|4EC+^K+=Uwb<;lqukd@}hAMk;BEu9h`zVv7#fA4;}*WSg9kqTX5Z9QA7 z+!ve2H~#hTz|S;HZm+!(VO7!%$@L^^4~@B)1pXEIu--%{Hy6_5>NhNc+5cTpnCR{g zi)Gy4quYI$*xuNrq@-M~DW}^39@D)!5U@>S(4Pu*I$f)lvb2PS1G%#Sw`YS>_*%!# zhRneOmh%%t?(VrPwvB4(kpqFlv%(<^JrZm|Kj_#nA1=AUI>{x!0TOdf6-Z`Yq-osa zWKHt6RNxEiu9bUy^w0^#^8f8B2SA)i-uuxD@>21N)WP}vOFYVP;_jP1!(LpJe_Gzo z&W{l_aAgg(`9FUo%W(tz8*RcMka!x02@&_IJA5t1(xC2Cu`*MdoaT{xa(2U#LZ{tb z**H0U-X1ukBwJ)q&wm&h_8K=Hx`~H@dJhRoWu%xXg%O1RAbt=x>7m1ajE5F_Hm(O` zr2NafTa$?4(P4Cl-G5mJpa|elx7=Jc;Olryp^CqIq&*TJW*hi_y_q5vi=ZU_@gFFm zV9eJx#*r;v^J_!`ujtHW`khxUS~%#Sj)=8xx7I{0V%86mI8Gh_0r+rOSet0%^4zv{ zEA5lU%l7f>?USyTyqiXYXhY(3*#F~!V4G%JDv-7W{J!WE4k z+j*pnHxeHmd~~JCOKz(a)AWkv0%T{99fxV*!@<5j@ln%@nf%S!Czjk=8~HJUSRC%Y z=X{q_YzA9cxECEQ8}}7YK9|Zu^X-wkatPF300H3>cV;2c!Xk!TWtgz9W+CtrNrqft znL0ebv5O%>jtoV|YKJiI%pgOo<_{SdoY~n^dVOU)&(ofR8c7O_H#=nA-S4=#3ir@K 
zswec-1~~b91NeUe8+bCh@q&ggbrpaXDonSzs8VBcYC7J<58L+!EQt*azaIv9sJ)E@ z77aCSB`oYe!&`M|c$hB#RiA5`7bkT(Y4pa}IJM!1hO7087Vt?w4u6F*$Geh|4qHKR@kq*69}V zoAxZscubTbkJ1lePcbu3&rB;!OiU>#tTuOi$&5iOG}$B6{Ytp@Rom@RyY(q2JD1+j zxO_V_xQ$k||DOT@1p^i4Y-6+Hsr>SUY9{30LJIue7m9f$)NDOn`h(KaVfC&QV@oYX z%@dc$o$-kKhO1RM^3Cd!xA^h96NwRd8on^rM#!jzV#nT4(+hQnH|j)#LaYD#(X!Z)#zw0dH1 z!R9I^F0I;j(e^vyv|s}_iMzESV6%+O43)3tYL1h(A{0t#^&#H zRpNt;7T*9p z5elTt8(T+p2dkD zf!7iSuitrm0#}ihsoK29jupN8%fEs2EQzcD&{|j;CIpRo(~h)wCZSBd+`y<;5H(6X z+mNfpRlDVBwn1tGq$x3x!6$s4oY|Wxg{2kcSQZ2m;hRrZ7V;K{D!ZsAZf+*r+&pBN zS^Rl#(Zj+2jK}4cOF+k(q0HGM1HKIiVN%kO1^~d#}6T@olTBs@B)}WpLo_afd0^NQ z7j$-w$L!rHP$4a(!kXd2LrAs$x!Jz;QBeui-nBHyxwbaf-;X3N{;pzP%&=4ZUeBGLj0xo%QSI) zlZZy6s2&=iZvJB+APRY(d~S}OYV4`bdU)J7>#W=BOyu2gjRNp^xs249t!!^1%qOK(KJE;+9MDRtTIQ(xHqh)JohL-2_*DLJ}RrJ>xQ zGmt=#pjM9{DOr8BY3U-);^J|=N;6q?QCP*$cW1hmW#yM1=?2~FG_BT)I4k=kmc{Xs zgyMH{k}^3-`telGe${jJOi2cC90LAd6w z!^e&X!5j=+Z1|h4BSTS$DI4yLG^DacU2%nZnL-HhT1SiI}LQUI!&FW+Y++ z0}|!HPy7APq%@7h3`VrYxdh<>V!Y;GT2MW?C7=g#l!Eyr+ z>43pK@@F9&s$etc{kt638&p(eWs@;EqutLGxSD#sADTaZf<%0oWKqQZ4Eucj+PwK$ zsZ-}0DRy8Q!4`&a|J|a-^x4Urjdm_2qVEpwH}980p8%7#H(fcou}4Rt=ybKSHP~?2 zrybyV)yabX7lse2HrRN4m&7?9R>LX^2?FQLPp`CVX(JJHYPG6h6-rK^DR^-@_O$(o zO^u@N7@iv%@Ah>A9hJ~pH%7XS~2_l|c|W@byK zQ%THLt*+hB+S)74<%ON=S6ji^+4=&UoJe*HwHEE|?UJ9-d3oC$j^+ttW)c&VIoa57 zc-^^3Nc@77e{44OHXk#f^XO{@ltd6l*93~{=&atuV$VD7dJ;^+o9~-{8(zW>m419 z=n|CR)$0m$Wa*lWnUnt$N0Y=l z1B`}C)6;SEHj&Zs0s34TSj?t585zHo6%717M&b-{`NH_VK>-9ErMJ1cxrU}@ZwM}@ z^FT~S2Ep<0pP#VzB`yH$7Zn% z3B1*Jdou}T>d!5SGuDqxR$Wt5XSr~_e`Q9Bl&j(NcxWMM?z-V$1Mn3;1)4q~==Fc`Whvr}YH8J5XLEv`<5J@Da4^K=?e1CH=(_m**K9cSuz#mvau(`R}6N($$CzHm- zCqJYDLrOrP)6o?a!LzsLvE$-GJCgSP{d-t!7C?@NJ>6o4>2=(m6;znblmTZWqCVkb zi=)+Q2H)+UTTq$7g#nbTi9{(PfwJk7@Mgz(XxF)=X$ zf1mY*!`*LB*DiKOg6Ora1uYh8Q<-e^5$p_1Ra=IVdy~m&xg~2H%KyF#$U}yLPKPEDVhP<9k^LMKThKv1;8Tu(}Hi zV{gY5O_nwIRoSU{cn&Z3PGC~p%j}=`n*BN7-!g>&zZPlHDT>ID_2$&>7#R5U#>Yc~ zV5p;#lAvo2jKA-7Bv5{+x8t_A`6?^>WuXLkDOEh=i7HwVXkx#S^mxdcMO 
z_xgKzc@^y9n|;kL3keOeMge~fDb$gRiq_Q70Eq(_VPj)*ln?uDcKOmn$*dmF>+e@K ziNE+aUS3|RPzZR=ET&vgAoua)~A0 z&Z6^$_>E1IEC!Z!Tyk*O)@36V5r8NRA3m@rCep(n_qN^Z3p4(6s3C+_K#SQ-hNvyn zMwyy|X#~uJzE6%(ZpD;fulJJg(qN}LfWC4frKB8qI8dgNkf+=FA|c83fq={CwbIk$|AaumY+G4nhJpQuiu`jPzt?5^iUFbe&AAAmnk14hmP_*U zjh0PFstkvqVWIXW^5Hk=EPhSY0bh;rvWBwqOqs3#Jbd^_IxnMc8=;*~bE|7V=c=D~ zrV}~wM!v#l0F-t^Fb8-&lZuLJYilbi=o@Rg#;@p}^?=2+Nq~s6DNianjlB3C8UG|JT4VQIYsip7Um%xd+9nQOxJ)#%KbW~bT+<*o9U`Kl0vqa8LzPj!bmE(nv22lt>jR^O zjP^$RgRJaqv6SWrh*EQ}e92U3qUXxEPx&1UE>5@i_w&r=kaH0K+(Fi3n1$xaBE^H- z3jI+gN=i!QGA&#dS0^8TSfin&*b`s8<)o6leC2%0C1$f}6;;(z^`=_?3VP%qx$B%9 z>$HmgSO;x~gXvPXs#~v&U~jYE1M~BPi>{NqW5>tGu8bq=>+253^L{Jn>yAg$M_ob4 zcW}_FtPktmFq@vP5!jpOoM|OoTwGZ5b62}_(KmXXRydd)VX{4!WOve9r1F9nWf#=| zY{`i}b~`K=H=ZKBkt5p~NpJgcgaz$yTmVZN0wMQm(be1A+sQ01btd8uRen_%E>4I*_Pj@{}MTjFCh>@tM8y+7foDU z$zX}2(UzWm;y1t{S`iaezL+jm_X+se{m#qZaRMXMb&ba~@CA=Mwu&7$TiPkGnYT!{BUfNiR^%$99p z(cgMQ_p(n5>X-0;s!kGnK#hovkX;pNIhgip9*~w6gQr~(5*LTNd%9@!e%A$#=9x0h zg%XpA{%F!Ix5kUhJ$W;1Y!+m0w*1NxE z3%6Ig^@gRj{8&2`^BpH4BwPc}_Tgt_ua7L;&i70w+31RjQ$fgNt*&S5*QM%=hJ&6Z z>@LR|0b#f{58F)==Vhz{X!Cn>*}WcQGW_b9Aj+a|F zZN_r|_Mvb8^a-%qqa4G-!+U#s^>_x~zRi@Wv)WCU6!5<1n4bhk?&ZM>KEwR<^mI?p zhxnBYKA%%$(iz2;A6JIM30}#ah4T4*T?~)h9>+M}2LbFs92m+=AW-jv*(}M-^!kIU zeD(bFpu6@ohN{XRhfF2|P!9Tg332fODi{^5R#zxp-&&<|HRi`JuCA^rXw)h-KbX@q zf0K6rT#EY{l)r6+vR=QX=F2USvjbBp(c#R5mO6$0||$h?Y-j8zA$ zmLWLoQ8yV83*e}!SOFea95&Te)6Igf57$TfMki~#eH<#XpADAE0|Nsw8I8(R8lq?@ zp{srjB~@(nJl$Wx#0n^7c{|#tb?R1BS9VB!P=okhFA}STyOd*>ru2^IK*#JiOy(2aDG5 zFOrgycs#CsXihh@!;WL8@ii&zZdlARpLX7BVlEEM>5{&|H19aN?G48d*^?nAAsKUL z8%jU9Kg8tl`&FV^)iW^>?Th3zlg#eC{?1@4yEmNo88bj`&iP_{NX=*mfb4bZl?FHL zQVCgE1X|Ja4?NFl>@L^GEiMB79~)eDh8yg5XtnBB&o2veb4gAnsjB0Z%QU=9@#*#6 zWf4*dHrd`)HZDC=-`||TUrEmEgM9{^0l4=!J1(%YefXvlQ214?Sh6oL~7jWw7-5#QVH`52njV>LU3q32yi74`&f;yi6N@Yz+)yvm)Hh zR_O0ye5S8=a=Y_>zynROL^uOV;1pA@v${B9TAe8$%ND+Keeh4c-kX4ULKx1{fPxYU z$6>!(@whEqKY4b009$7m4!5{c`>UJdl{SYbj^odDmdl&mZUq8GM 
z>zN;9;hTlr9*&n18IAlu;z?3Mhn=Z;N{^J}Sh~Af1D4+L5*0i8_<0n6z8CW3XWXROkQ;K{qyjVJKJR+vo zN^Fk5e{~uj4$CPgV66gw3!>t!i6jwlm21m9UrQBrhWDv>hFXdGehWL2 zz=P`u(K-`Eg}ZAinaJ>M)${cz4&KQMaOwcRT2S(l-HwG^s)7gx6 zuyOn+f-YPeJ}*J6`N0gjK^8Io8zs#t;{J&VkM5tps*Iv|oNbjxVBmUQ2%^9s1?r#K zhPLD5-VP@}TOH}Bc<_#p^t-R(qmcgYE1tIdkUSz87G{^_DODLt?S8u!M}7Brnv$Ao z!#Y>5>(kQ0t^c(1yS&k|<;O&x%uY)rqIGuR*X$20KQ%^$RAh_m@z1!YKmC5w^w8ue z_=CSB!)UQiyX_a372_JM7%}OJJSz_Ys|^)c;?jKbIOMU2)nYjn#5(Gd3Mez55!;Q~ zN2|&;n`*DqCmwQgbHMq734gQiEhi%IXXdZ)g^%yb)8niS1eL@esEyv0dK>-OU%Dw) zEur|~gJQB3d~ZYiG2^%LhLYP{pTw{I+-Krw)US`1%znEWQaC}q)=ow7O{iyU5(TwU zz4ur9?;jugIqTl;hfa~dVkg36xkX!yijUtpYT#qwU;K#0_RIP!s4k?zc^=km0z0B8 z-YkI!v0%V2%_fJ_bp`kPOQ?>(@-*Pc4Kb zjz}bIEr05R%V(jp!Ce;}7a8dldS6=$=0~&foc?51t3Hy;p#^Ma4YD>URH08gQ;N)8 z1_nTY>2r7tMsI((u~QmRKL~+!3Thz+Qz1f@UBIGXL{}4@BjXad0l`8V`q0M?Y@O6-BF&TfWE@- zP15_YP-j$D_H=*E?Y58dpc2fcq9a1SPLP+Bqy~$OEK0kmA*!a;C+AW>Ru%kZ^F6L{ zrPgrnpZL9vL`r@+2COh*8mS?E+K&jn?z(d0UT-0au(2xPA=fw@_TO>)K65;HBETYH zTHUXmXrI%QzB!hV&zHt!x83xEL7%U+FtgBSbn1)vqg-~XmFpeAc+P zBC`Fr5-t*uBK>Gn^YcGC@2_SPn9b0x$D4(Mx2TkyTwKr@EFduy%Hbk6H#U5ryR@ut ziWRH!05|?teE}Yyr>nQuu6lB31c5u({h)Y&e6q}=dh6-4<&E(hG!d1R|>)CQ3v*1UtgYm zB%w2yE(!{Qy3je5nVgz(_INNcF$r4ChfJ)eg>nBTOm3~qS+2VAe7v+hl-vZU1x%Gh zhFrUNTJVP$gy^VQA_VtxzYEq|9WFK1DVJ-3(cM>Yp~dAUHyHhllO$X~1+X_bI5>;T z&ET}}+U$?kRc+H1NUpVTJe=j*XMy-OFfed)Mx)b~5Zu`-gM9;i*gZCe=vejia8qYF zVR#lra)^`K9Fcz|%Q!YRhM=|Em-&a1mR4(J8ldtzcX#*5uU{ccHH($R;>fc%g%mV2 z@_yKt^0-MVl}@KKC}ygJw?b##nhKX%sCTm4O?h~9eQ7lu+OIM!tU3A8j0&r9pDhxa zMDNCt#PrD3c(Y4yJ(}_Q?FXjHIzZp)Sc~?p3kA;x{4^R#<#gIbh|*|vb>97@8`yua zJ(wup7_2Vvtljt>>OFZgzN+x5!>expmTaH#OnnhH93s;^n`lyGQfa}+lBul2>w zkiJKhan1;cLfkC3gyC{v z@7ozd;pS=`ht)FIqd9<1IOymB%kc9O`b62?!RP|>;TZJZxKE)wVSS(AXw3vS-qIMa zh12`Je-N?@)_z#I-^zH!=Bn_ffqfk#y`gcPrO|v(?$YVB3Oz@sKC_^x2#x(M;+Fv@ zCnqM;Nzp1#zbIX>GiJ~_9*T#jr& zn;uM;p8#$wQy{=;g>^WX<7Qr3LSkbG;wBUakIr_qRLKHR)8;sjNXe{bV5tZ1?ivSL zsl21hT+R6C>iCOJyKUXj*w`TF+(k{G;_Uf)XlAAkO#J1pYn;|Nxc3sx(QBvJB1qqh 
zttTTFZ*&QThlYkOaP@=@HxU`6N$%rrO|l<}A&t$WV#*;3e)=Ze;J27-s?U4TM$$hxyfaJ!E3&){;h%1$ zl3BPN?O|b|&aTebJ)R%IWOa3UnSh~TXu~V55TUQlMOsdd{)}gv_-60R@};H9`bz?QeJeZ zO!F(XOmK?rf?mev#AbnaqbjY=7lx;+?Q|Y6+T=l?H$z8%aCeMjem+|yTzI4I?{BX_ z@0rXmOu$InXX8-n1rFxy-muK{FU_yo*fc8Q%Qc*@_E9-Iun`bWrxudkBmwyJd(*MC zUt@NtqYB2PFgS)R`(mY0gf12H(UmWxmR2kGe;>uD$*%#u=LE0u_3PMt&j-KWkwlsg z&|j7dN%1?PV`2`cYp5ot{6nz;8}@q|esgb7wF+>7fxfqi>MCmMzG-YW-hsTYIIqKC z0dcO^kG?opQd0W$6&+pqeOr2agb@oM1-O4)k{g5)IPy;74O4GqLP`Sllds9qEuEG1 z^45kt;INvBLQ1N};V1;1Mu~bg(Jd0P)DXy}TZ2P(c6NIB+|Y{^+-Ka3h=fGcDBV9@ zw!I11VI(Nh;!nUkKXSRe2*7yc0fsi2RF0`ufdH^M1B#)|KAGkD`MFu@C2!CEf{O#E zOPuYxFxiY{!_&B3u8wIaN%K10LdmJ9U@{#~mav&^4Th4Kbt@{XaahbT8X@v!(rJyG zp`HS0Q#l2s30VzbUbwZ`Kk?4-dW@exCk#F5n0i1!FkFW|>2>z@8Ubb?CzwPO$`5oT z=#8XtTg=xADfJ;-`RKQ65hN#Iko@lH$pajsOghieXy#jJD5I77GjQTsl$6TL%l9AR zP*4<}ZS>Tb%~*mJnuEc{$HoaJWD)u#Rx{yB)6tCfETLcm2u{a6{(XpCA=f}&H#awO zc+rU0aBytsqJf)z?`#lW&t*-FH#u5G$+Uj4owMQeLWp5g?Sizn?MF%Hs4>$=?@FKp ze12YD9#BRwKU=SYub_G*jQF$Uwmzl0S|e>GvcQ<0>^Eepq`16 z5XdN5xu`$PN7Qey&)r0VRA2j!ygeHt={Z(wkMx| zcyz?;{@GmYEs!xj`YH2<6|e~cZ_%VrUgv)>UQ1CUU3LJ~h7dsmaw}6r~Fvyubxe!8p$Ii(3Vopq%!IDhRoE(+r|4PqELGgT5{9BU~ zc^UPhm?!HHxw&~zy>{p#c*Nm|S2_yh3Trz(OKxvHL$S|WLug2e^nsk_TWNcHXP0f+ zTxem@s@p9|TJp~6$K9}c>y*j7wFDz&6gld9s})asPMMEKEA|Bf$~_9#|97`kQOX~A z$V(&jvB1#K)WIjTg|wxnztgvDxx@+u_)dJQc(LmTj8ZiC&Jc*%i#b6lek7YPo0Qb- zyGDqN18^Uwnf4m-3VZ_8?&H zDPjS3#2?6{wrO4D6kFQ{`A_$#Hg}aPQQcs3I1@`;Cj`Xk_4RPbPn7s*`GZlu>D}<1 zlz*%RUzmEtU*b8?eo@KEn7FtXr)%Ng{=8u+33q!eK#5_7^pmb>nwR^N}xc7=IrU=G9#n<_itZE zt2IAFK1t%2AEbbxpr|;K!t;b4-@Q2V@{5b=#RG-vwO)WpkBJE|t67Q0w^sY)?y@E4RFFs#_n<`*`u94>34U zKU|ku)|g&wkH~Da`gyj^`oDKTT3g90!nLYsaJ^$NH38E|7dAFFr}6130EBmUcZe{# z=7`9s@zK$YtTTsGFqJ({{c=ZMZ@+z%II1G+PI;e_92qIpruhYYey)hHa1S8+#|Dt- zA}L~X8-N)4aDHVbEe*F(u7@BtBu*l(CFkmzb+RmVx1E}r0CJL)xuvD!B@JsS4S8lW zd!g9nlb#o17f}S&N&S%UBhu1nsOad?aeDtef)w=Vbeg6snVsN#o0ymYE)qg6F4vwA zOm3HJ*T2u)z-aPJ3}QjsYd)YSq>aw{T)T?&{GNF?`EU{iWa|Am?7xeIF)cLb-Vds} zYD*&FWKKz{efd&3V?^)loa=vCN&@a61#>m1)TyZn{Rm4!OpGBD; 
zB#L;ufv;cBVjF(F{=<#)g}cQ<-1ZR( zcM5lB7Y;vi#snTq!Q45P&eoRU78@5^rWr+Ch=3u9@l!@#ktFyFl*vNvQNYa!nPdV1 zE$wguCDhNKKRF!_>)nr%Dk2fVuLmtOJSYiZGFGvYVmkfXg{-WsxuBqaEiGW_NyK1b<>dt>C8uXq`iE)p!6`UHRq?PlAIjqTghl~0oM_l{YTF1iY=1b5a>6{3GmN@fI#2o z7eEXZlf@$M`?GE(>cn&=2Oj*>K#a$!Ow+s&F)=ZA_eZDnRyTVvXIIwZ@#=PfSy-HH zA!cx{`TJhfOgtS>TKQ`Yx3{x}!-tdYEG^5*%TFq|{$S6~5sE#);Q|>76?e#kZlJHP z-u_^El~5o89*1SsFA{27&H$%1-cn_Jd|Wz(ZLCI})tAL=nQP_E=6f3ip!_2(>9 zt+$}JjV-7^-GJHoeADsxn9WV{*KbHUD?&oTLlQ5_Mz;qCB_%XuVTxC~7}3nbKlFQb z1vIpH7-?C+{_iP;{e(sf$?eW{7mSWvUJpP~6yR`coF_(VhX2im@tI^0&?$F{rt>6c zDv)=4$~ME;pD<$lHURJOcTB&BjYh@u$V5J$#cFwRYRVn}6yowL%x$K>i{Box*Mh8Z;6y?!4AB zBRCxnM@L5qy0u#Mx*nHijc#SCIYS++-;ckH}q<~ zrE@c^{I;O;Ke4Ii=f04VOd1a^lksS5v9e#I^t(dVL%qe@Ga%kY#Kj4xgbWV8C^(u; zaZq_Ep1@$85?Da)u8fI@7-Y2~ywseChVc7cJVqwySIlywaw$k0S>}E57b~BgJ)q)Y zPPEo=Z{)|k#>B)VlS%vT=(tj2x6=)S4ulmaoxjf)JHvbX`*@r`P@e2bwQQi^;k6GA zT~Cges*HwL!GD^-z5t-e_JzxtjEu|(xDJyAUpG4LE|h7t4YS_|_?oY@a)0t>oP-3_ zlp~-O2z?+{TI?Wffb`|~FU=3+kLjgOfm60SZ+-yp8oAtlVtpwh3&#iZ>pIXyBJ}3w z<|K1?pnt^0`8&plL_{mBmYN#?bHIY;fio7)`vaWJGoomkEh@(m>c};SrbS9kv zRhYEC+GM4uBF=xlp&0(f!m8Y0^Y=Zis772!y4(i?LqjZV%uIGb%!ejm*i|tAX|Jv> zfDICsX8V={=EoY41xVO8tN@Y}0qlEPU2-xuwQolB_HN8ZK@sfU|S+^Ut?SlrTtS+NK!TG8nvC5)#2qk*?=CV!&(Q zaj`9Bpzs`CjL++Q4*0za{T?QZ`4|*+adFe#2twfNq9sOsZQqgC(avHqT>=hD@Mpt$ zz%$BmZ~T40#l%Xw$~h7qntvM_8rm##g-(<5NE$GZ!F)8xbB0EO0J6MP%4fc(n+3Vq zhrvWf8_Zw5Z&aOui1A|MD%FC_pwAYmGLlCCx4=02;pW8gc+nXQM7FjmP^rHsC%=#9 zh#4fyL7vfRSuc;Kv9jKdwu62syQt_~Pu|__S(4Qo@UH4U{{7lSCQNUiS)K$S+^9;fuo5jA7u{ z(t2@70ZL6yQSqB(?$7!J43la|BqSt&?K^rLd3hlV!LWVfc!I@Z?75vHNI-zagrGJQ zo|YE)0cKvu&Fx7Z3})Ppv@9%fA1%HgflVEB@e5lcL!ppq8#I7^o}{ECrMS4Xw6u0U zK45DHt|BmT@p&)-uinir3My*mtH;O3PaU0@^6sZoK}_>@6cn)b_9P@EJzajy_Na=# zJ{Kt#qZJwNlxaq|87xCp;<8%egw+Ac%DeY<%9}G3s4pA9LeYl@gOcC-a083QG&=Q^ z9Sk4o^@G4DnjSU8^J@F=g%NnnCrc~O@#z=`o}9zT{oYa_yPp2Oc`xn7SxeSz|3_Rj zLgA}ot(Ey_i$*ycUR6yT=IrTAAMxlskY!V?sIVE)WXCt;xYB+sj{nb38%HUC+c;gx z;&OK#eG;zM+AfqClXBXZARm8RV>N|=POP65L$aG!zVa(m{pPrFs;C`vroiTWQ;f8v 
zdTey`S7oKRloUwj`X>d2g~f^R(u@#zgR7-`wNH%I3GIwlpud(_m?za4R{-CI zm6g>pX1RA4B0?ZGFc#$GP&tJgX`^c=_`2Mr7I)92B~5N zwOGi1|KrD^V9YvEW>(gGWvp62*m;~1QiQ+8`^4nvRI1gl;W^8v5q zbjVO_`f&t49`{8=G%~6Ky0Qd1UDn2Op*$I>ut{`eV&d@AY_x0!)e0CWUkG!ju&^+& zuY$E>wUjrsP9iZoqx&w1h@c?u(3=^77#w+V-roG#!l7#00o5R%Q$r2PV!(iE?Q*fb z8H5~7W_P?0kpmckr3!tB2dsE-Io|gs^dl1{bAFrc>hfjp;QgwsyyH?GO^2GZ{OlZq z#ZW2BT%=`HC?c2BR?rxB|7)2l@BLQA(#`d|Yuh8qWPu2Xzm{X_-%9#@toRDe51JajcYaukBtkuyiC2*K z`8^mCs;iWg$jWNHGt8!}%=48HrGqy)G8Q#9X0S(wt8MP^KdV+O%QPDE^TLj~d4c2p zc*~H>$nEWu-W&25hK;J5aFPSKX4eNXc}@GjaU_P!MU1awDt!k73V>+I`4$DV0Bob& zs50Ttv9mY(V1aWM;KB-BQ@u}s&&kO#F!^?My~GI)b|f!PyvK=rnF?UI_A)LkEVSDm zG?}m7i!av}d*|t2tG&`XOMtBTK1d!x6b1tx-MFxPdD&Rw>hcN`86H8BoR?|N1s=YY zFgb-cyzbC*663SZmlgM?4)>8U-~=nxHDAnjxdAIw^tegKyNl2o`pKvQDA`-r*QKDe zAg-3Bxv{Yk7MscX=CtqOxq(1m^4;r$p~1nAey&g3uqz9D;9r3p;VaO}J}_EeUeS(KZZS6ZkhB_c1AU&3J#QzZP|+|}4g0igTmsG6{(*{M`!lfbMuZm3l$ zvR}W_muM|@Eh&NkB`o|QN!>hxoz=UG0S>3HP{Kbzs!n62O` zR$DwcK$2eDf39u?x{UB0ID>{+bL%F*DkxOfR?Gnu3iL!W>4Yy@s{H-^O}pododA=r z0pnJnk6!eT;NW1O4n=@o3}68fh5a)0eSgwqYJV!3#Y$^+Q?QyY>=KAo>b2&H^!gvz zGlf2hw(t#9n@;TmqPbo=PcqS)F_>+o)lGoX5-;Wru!~_h>?^E4bW9VGL-7|UR%^db z+3Ur^f@ZcjFT%tCd<$cC7O)`3!d5bj2jlj-{YAW;oq|2Cs#`I#A`RrH(END${lnGK zV)a)|dHvzH7UB(F*GXFQr8H`AbaZspieS%9=k<`jy9g4Z4-5^)=FxlS zY?d?;*4U)~mNFAibj18%^`g_Ly^{c3C>ZroJOu?Lwl_C(va@~IBG}&I;EdMu8v+U# zvNW|IK}A+JB0XI@KMWV(L>iqG@6nl=@X$~pf#5fy5@LLd1cY+1KsC=8VJG}c5{y2Q z$muwHx&{!jVCdHNCIR91S7Cl`<@JZ2Oqjfui(oDTwm294%O6S4 zOpzr<;iRE~jn9qECF%E-Ljbvy&Eoe+l$7GISOZB(;zdRLH1F`aUS)&dc@8S5kQc^v zr%lvXi4hD6Kz~wXN-nN8hmt~m*cAI&PIn060;WgvnO{<#Cs?fniO&9@OkjPm-VnJA zU317@>)8F#9wWyuFQBNbF0U>jr5^2$c6;K<0@dPBP2U@aYv=ZiLlfZR(>Y78pT^?` z3fx>z%C*&*tirJ2kky@lR<&3mpDzFbzuW>m7*sG%F4g5AWMG~)yE~T+QT@jwUwWz4 z4Lg(^s{sh*aCl6Cj;Ao1*HgtxHRfaMlY0{&V<5yiUvINvqu+ZCCfry!@2-AydHD&N zWL93M)78bh3!s2|UGu;8hs7<*%IfIqgU9IxZc{pqdh4@s5@5qoz}+{!OrW1uQX(m& zZy%eQT57YxhqgYtw?}#PGe@`6|4%%P85mxiKN9KS=%}>ZgImDO#Wk#5@7Z=Y{c7L( zY`q)6Umk!_fM;IpstmYG`}Vg4B7O+G4#Q)Av-Ld)8=}JuNRhz&>sG(vAJZ|myR2Cw 
zB;i2&vw&^qbwYDZ=(VR8$e6Afji9-#23@$u&JA`pe_P;cyEUKLZoBL@FenPeq=KhN zx>1fyPCh)swY`;ydGBwcqC$;2m#(7%d?1d74tv@7@dj-SJoO;_?Z$ z3D>)^-EqZIH_kbkF&NfZ<= z-6ls{k8;xlKyK}AJELE$Be@h#d^m>#EF`mv%dlluEiT-ZThV`r`x-jLpe?yF? zpGx!T$_jvi9RFZFN|`L@3Azt3WFXhM-kPuU@n@l-%J`c%U@Th|Ka)uKmyLWT5hNjD zvog-l-FEYU1o_B7$TnIQIuMTekFE53>6itbXSC%Rvc+in>wKApG&cYAPnZ+N_Za>g zW@g6%3JVqzfBq|FYXbuVmJ;p1r)vTn?{D!50QV3QXbsr4sp(RZXICN_(6tCk#%qoaV>!ouO;m7Dv?9gGCG&*hz z{KDVJjG>IuLoyQK(AD28qzT%i=n_Rtfs3m}PWo=hMUoBeK1V>!*5#T$fv=H6W0@^GEyGcOXK9kEemy7k%ZYFo46L{)Bae<5g zfPGl|`{(EHuXn(9(k-$)pvEsn!^+5bzs&tNmI}mkh7a5=mSg@c3lcAS(Kl-#;7Tg= zzFlE5Zqm>I$VJdJF2K*w(E$uG)I%Zvd^UKRiYPZZr9hCnaOIDIzZv~h2)IL_w<{0> zyoZ2tC`qEIxHI8J0@+;~sX1`xC_mF~8L-*gPH*EEbp;mz@n>~)HBgwg zaE+7+@bd$Ye3)f=T@0w5G%Qc&3=SZhHD&aoQc z!06OZb_#Eb-U#uxi=G3YIiRj08GgT5@x{pfT?kASmcN;r+f?seBui zskOCrqbHmf48gVFbWV#(;xIf^F?jAw_vj9er_+W5PWi-$b!UHx4uw%Hp*e+o<$zU)Vd(C#k&I$K1K})&(1G2^|w*w}Z&vSQs)mO7Rl70*gsxv!` zEK5nhvi!uPdK5GcL#flHR&FiE_#V4i^h*nHuoJsn_%+s`nk7{4^@ll0mDHFy4+e^uE5I@ zvcM_m-^h{kdy+z4$17FPv(_^T^z%zw%Z`GfR#8zASaR@qYz?{s5kHsNJNP%b9lThu z!*Dsb?;qL4-s8SJp)ZoIh)8_COs0b6a{bw5KeAze5bxQ=#ldpR!-h!sa2mVa)j%vZ zh+vd!{{?s)gg)yXt|*&-5q$@yNLdBo&K?;a7N*>drBW_4U#akkd1nLM=hj>2-|Xxz z0Ka&5dmAwb1$J{j_LHTSU}Hzi6@OPgC14X2REnTrGr(bk!;keW!T=sUFe}ToxZNBB z7Ej;#<^-8^x z&G@G5rQF@~@%9+M&pZ}=6wSzg7jG0 z6a}Q-e`A`$rLn-2M}2FruDvoTDwTX@SHb)_Ha5nPOM^pgZhEZPYhrAm+M#>KG)7zy zR)3>A7;UOhUL>SR^j!o3dOE#+&oA9h6cm&|Au+K+;5wPTUGJt|c|{L8K)}`K2*IJ} z2bColG}P1}^TAi^Y%40ATu>N9BV5pp+9~Ec?myI z^t;ZXd?T606hs5H*#~^&zyYzexESd7s%5;^Lf!U-N0cS*+keR_+Ujby_XP&#hySsk!n*R1R5A4R*P@lI+6A5Z0+2L ze0cp7mKw8SI4U(j2@~Ld`EWoogz@?D^zR)b7gw`ksoMVHoIAC_y8w848yh9c*ETjb zn3%&I$bh{Q966VZN>F(~9mM-a(?(URMr8|{wB&2%-5|Yw&Ccgi3 z?iu&qFAT=mu=ieT%{ia?g>lw&GzE$sk^ZNH>t5$J-#)P)r6u{R6|SEHq1*KBfysEo z`0db={x{ZQ5pC}Su`!p#!^Tm*gN+|)<(Nt|pixq*qS6TeL;_`N`mv0us@L83{IpI1({4LXe16k zdkm>@u~V9nelsaiKvw}59Q+nXxVo`%wc3{-IQV3k*_urMAKdOP1QLe`=A;zg$Hv@Z zwwlURksY5~hR|emB4aZ(HIoGFT7Vx9E_yIMlMP(7%Y~JctV93I8r-eGxyh>b6EV)& 
zeJGxeAri|e{Bm)sT-r6a-$bJT*6MpZ8aA3=pWZ-G5OGTVXxwqBiq^k1`|f)zj)4AW z=ENgxNMb$HMc7=qSz)cY?wC?cB7qs@%8;;QaC|Z#lc2-~0bUN+cL$E{2xndJON_{> z|AK=DHgv|SnBwAMwx7!D0_7WTi&-%f!uQIMET3{i=j${~Vq18Xc}mHREj4;}mIvW- z&ab+|bUX!2rUP8jY}BKym-kL*T&gMlCQh%u-b%Tu7Pd4@0@8-12{3KgZQU{Rs!EjHl9C7$)I0?*X!}bg{-nRN`|jY&W8wP!v5q2} zvX3h5g-r1GiHT>PG$8n^#--+RIPWZg@=(7Wunq=cQ}M*!rKhFUkG^toxkx~vgG~O{ zOpSw?nVMSc^7A|3w8drCPKb-^7Ry&E#}qG3uRJ^iLUkae-_Y~)vIVJA*N6QMzo|jqYKYtYNOvMB;HlPBB&Pj6$!X%}5 z)U=QQC$ACDuqevSd}9um=&)bHY%O`aX(`1G*o9^oVyuVSBfh{8kJ6|}R8DP1keY}P zLk4;uHU;cZhWd*d4E)#W8QSaC`K4Ne!WwS*DeO2&z%N{X(ehn;oz6whul&%Q+V_Vi zGU2jm)_Y_2OMV|7QnGsWlFYVWYjC9|L*&f5*BWtMw4ov(+}8d3@42l{jPbHEj#+f# zZr;<*UkYG>BOyLZ`2}`h5`w{f@U)8BMwT{9ji)DdNO{&!T2h3ltLr5I%7d8I>bKUp z(&iRRBG?I9Ve%R5Wk&Y{<(iwAS%!{E4)0D=;Naomua_ERQ&pcH+%8861h@c;%yLg~ zCk5v%6bM1Pmxqk;~WDQ!y5E1!uiOD4!xMPA1IU z9@|d7bMQy9vD8e>I*bMJtqT#j%(+yFvZc8yF2z7Z(8+ z#U;YDAF?6owNABy@OYf3rP<>AY7AD7<8yAf*8PlKUdZZDdmrpSB6Wl<`ol_($r(C7 zF^JfE`vzhmY|pLc6DJbWXi{F8R14}3l}bpZ2sC7_+s9@^#}P@uGPZjxQ~c&7--^1b z(*1jXLx!qDLA^-{JVc-H}_hx%g;Cfg_?@uA?Yr$j+iObUn(x}2hW2jf5^a3n&;ligs=c|8mu&YelX|2dfh_Eto+-TYqW~pK5E6&)W zx@505Wb&OpH2>bI@tBL{7UaI;$nt>r=&45+*w{w3-zb__}(kA0o)Z-S{%T1CZhl>_-7oewipu) z&A|4;Gsr&?5mpr|k<)e&B+;Rjfs#?HtE-t!He^dw2z|e2%BBGfgwuAF=*`$o^b2f2 zLf{HSc;$O@Mpc6-4q&gkI$-gGSrp0RBaK3qe+Y#HQSH)F)BKlQ|7*?gsLgu9eJejB z1*@X2;dP9sZkOKp~KRE%$7j05oNoNHmWuDuk3z^O3E9uribG|}g2wr3&7 zZMfE9xMI%L&-gT$%>b=AxTKy8?a1P2MsULgwF+X~$30%1Ma({bm{ z@YwGm!6}ceM+L-+`dnSvf#D(J=Ol&+Y*R;`Z6#)AGGXCv^|ky0y{BhseH4d_WM>Dt ztIIR@umg@6$cG9#A}X@N!7t%{O>>bYx6c`sxAN_Z<4_UspvaHYY6%NDintkr+a5tF zDNs#nVN0csbQEC=dH`_AK?!&q95JvATpWR%oo-bIx3{;>yJObVjeoj@0|AwQ$MsOl z?C3QTV0S=T!jh6^z`z?cue-o#4DAgsxlYw;N88nJVF?;pI7i3Rc+fU@&3g%Oj$;{I z$8%kRms156iic8aa&o&9c-awRva%yHGk-yO2Lq2r#O2%mG?Clfg$h*10Wk$&UUi!+ z**$cy%^7r7V(aFzEw|UmP0WSl3#kkIfu6WzL4( zr4b_{e;Be`ddq8~y1cPO8aE9~gNU&=_0l5pfHJ(L=p?2a_|We*{jvX6M8iZM0qq@$ zC?m65DXZ*32ECd5G&es(sV%(+GT)i)WYYCz=uQ?ESO%_x48VseEtYso*4tBK}~hXlOfhc)UwP+^-+ug>@9-jdBs 
zi0>~qjZgxe?6NX?$-bn-3@A`#3OL?hcSFGdM*hmm3YfRVyy0(k>Wd)kf0od$i)HyB zSx>-bURYAn*4!*5En~OT>H=(GDC^b!-#FUAg~+6t)&}cXB7sq^*b*r)%ibN!T?5!z zpdtaVatYr85)!8N)hp26@P(N?E0o-b{v~|^VZ++W3BG~uiTEcA7&4Flp;S6k0l&;P zjlsxoo-MQV8skNPUpy{iA!v$QqVO?w6mismjV7K_kHE@jbeLl2I?W zr{cA3y36iC8-rbwfr3h_opPj%s{2nhO~$+oz)D}J_I>CHeI+l)9XpH&8p;+5xjSr_ zm6er^b}P3RBY1`+xdi~vXS+Dl-@gOcA8{DN!%z0`fcBKVGvRlzgG-3%^mu;*U_ufD z4#Whtq!tjx6nt|zJ&!l%PLkj@2?ccwTSH_FJ5CVjV)ReOonJPTew1)`@7~~f(p~b~ zgh&R7~#D&fSc%SLbIsv1X<=aQc$jbKiS7#%`M%0|Xs88w; zq0wgQY|g*<9W8UsbYpG7RK1}nxGV{s;AQ&}p4Hv#Xc;4* zgCb>V9XT>$QBo|i)`E3+?Fg7c2C2V~a&QJbs5Qogf^FP@Y;#X~4k$smOMPY8W#Q>F ze&0&c&>VUA9*E{uEuA?1nXWj29Va8@q5Sz5m!G})TqrWsYyIkPr5ua$&v6Y@J9#aZ z7Hv&&%W=YD+O#12bEFQL;d2s$QeB(%7yQtX1}YEzyIe??bP1c7CWpMfx}Q$gy9wfQ z&|`8%{fK>iUbUF*?g*U=$B=#!Nn{F3&q#lakfh9;+CRtyJrb5L1}KB_g%>YRyEZQ1 zt#+mc@GP~{v$zVQ7%+^4c@bo7VQj^dFL5u@d*PfR7Z2#)Ovx;WhK8^>XvxG076v*= zh6w7}>V=eOD&S3H(25)u*_%K$?CkXve5(0kYI^z^J!iH*1JcTbRFc!VzP{}r!6>LL zaUX#I9JC5GHv^_1ZNDK>JR29D=}oJZ$w0d8Jh$H%^ipdu<9>GPW)%BpAj?;Y+#)f} z)-}ktt|)fYViy+bhN0DkNZy9iVoJV8WcRECZu zEYS!C-&2|dltvF0UT@hGkGJ{4uHMYW9{M@ba0?K+y0g{Rd6Nfdt_gbmo^Z<`^&IS0 zvNG@g!+h*eEH&wxT2XP7ErCfHgqKcQ@m28~LyJ#<7%gnpskSECx(b=9>-v{g2QvBA z>bDGJxEAP4B_L;V6Wa*Ieb=*$?c|9?`SbWoRr%9gUvf@ z@?A|j?tE|Ax)WCW1TUs6>&)EQMaV5D_xU9F+Vq~<0u6bV( zt18_{t}d2Fnw46Qw6^2b-v-4hd}{#6cu0Bs9_4C;0r9MYuh9f)>nSmQS`+nuI0A-H zCntXmnynVrW zInWF3_ewmA`Ui(tnbV8(;*^QZuY~%oGd79Z{^c4D4dGx-$ zQ@|pcn^iop7tCG6Rk|9@fCeM;s{4I^0Xy$&DlvV1pT(K7S4V5FP?0>2_FKIdSi$%U zCo!!#2!`g{uPWQw5-;6;HJxCo{U?Ipl2@WfJ z#Q&Pc(^MM-?g>#tyUD9j)hAgqYQPJ^z03P=1cp-?eV$|n&CVzu8Dr-po@1ZBggqs_!;@bbJwQRujq9>0N`%1<~6`nL17}pw%qK2PrHG z&E|_g9DBN~XfG-nuigOKV(6~`F#tie(4sQK;!8;nEMR`P%^xNu{hgcy8gM`KKGNp1 zAPu89var-Q7ya)2`Qjmk^@Z-o#OqD9H)N{$0dV2R{$Go@`+n;?)$ zMoCG{tkW@lHBcp>uY2)Z>u-q=T~hhb3Qg67frme8NqT`UhpkXla&SP-*ozJ%EF7y} zqhmvWLlNzF%6Hm>fWQ6K77UY;n`!`QL=m)?-Jb*iarP5}K+@A^#rWvH0HuX4ApmX2YF5zq_)Xp<})hy+FFQ_O~iQoB0cp z!l;(;jU!YP`s^gStTeByDN#}JJ}Su9%JS$pk^;enf*!RZf8NKj?vatj&wV4y03hi& 
z)}JCw6il5!Ov;??)vKINp!|DOsq_VSnl&leuIK+;V0c zSwcg@EF-m-k8897kJmtv&s(y;qAZdbV7S44_c!_HoP~_& z;FvrLazwi~UEdsDrxURy@J*ihLLqMkqflG?e)o=9`DfW*7EzYCWPoK|_r9u!sO`s7 z3$(XC=oz%OwY%fxp3LF0fQNy`m^t5QW3|-#jcKbY3!8PSH#Dn7T)|TZcx7zu8d=C4 zUCG6_;a$G-g`$lCQ02y-1bkW|;&5jGn-^Y3TU}Zj6j9q;NyJIo`9mRB;c-)Zc2p2X zosBA}JY56`8b-u~M@EE4#)J<@$k6WwR$up?gBEVL`=suKGFNHv&3J~9Gd z;)@bFjMXGG=838oEP)_v z8Bpz`#U2(61?%!GB_%(hs7?=#MBPW38B8d_OJ>noEx}Qrp*RydIvzBu9ywiP40d@3 zSb!x_FCH#nzi5vuibzY3PbILNzMu%0-@YHtoli^qjK<%AH-#6X?a;TzaJwXYkD(q0iaeO3jdWF1X}d`zPFy8(+NEYSI^E}+z)Ty zd3l*FmbMmR6qpTsq{3oB=M*!6A^`ZApzJE1HzoxiZ|8=pQFvX0KLy@makI`occ?K+mOg?^;a}s*CvtBQJ_Z>PgQQYYR+s1S@xCD_hbO(VV zfYZ;;rhp98z!h>+$k$Bd04%4prPY4)8%>iN+kYxFzj~Gmku?I|F7q6 z_xLxKMl(}W8o8{U?d>6%eqFArr6sQ5BQgUqAj;K|P|JFX`^psh6gGbN8lW74Rg9$m z$)J$}mA^d7WrQ@?LT)Q+Fz*MfMxc32o}ZsUNI2!`=`asGY@d<7-C|!~;1d!eJl^)X z`%YJSJj#~U0w?aNIbb%fCv>(Vivz_fA}k!QI7$E}1Z)i-`23*dVa!xZ?0otsf z)Qzd937+otgKa4-IkL_-5sYWc<=qKb&E*M1QSu3W%ec6V_)R@!=2wZmyjO# zcDZiDfco0`54$See-^xgf>Lm|>r#LS*(FXJ1&iFE8 z2jEn~nEA`Y55+X{baNH~G79@ogZ>98=&OCL7$Gpn#7S1tpe}fSub_$dJo8c^=TQBL z2w1ZwoF2n3`wcmhCJg8q{`0~eRq&vLXJxGgrbB|hCm-(hzF;tC3HG?dzBuh@=hNi1 zS5Yr``x?mU%81jjk+Xh(&YsRXR{*Rdz#?Ih ze%K@N-H`)ervL4ab+2j#zw+tRuk4*M@;chv%RT7hoDNA+jvc2tF{8{?;O&-oDvGJq z`pK%pYmSpW2(oQ3Mf(Gq*Rsw?*t?MYU>LgvPeJ)nUrv6EXqQ01O`0bH@Iw~yVV@FgPx2FA7Zo`Ppm@#-?A;hi{ezID zJiJ{Jo>vJ-!KBz*T`D*|qaGo2$S(W9y(`^z(Wb`QjSLOQOG-0PQmL>3Y$8zC@AJ)# z_@DoQBuR~d$<@J2!phkSyyxul`5@T91aV?8zK1OI6^+k*+WO88??C)9TcYuRRIJUm)b@rKAG4!-SaszQt9)JLIA!mSh{4vepDc z)}HoFb#(SaE-DaS5g?&{2LivkoK@Dp+Nla(Y*lq*-^>opr{P=dZUaR$O?6chIUN58 zRr=dmX{dBZkF}Mmqo`1k8%ay8m3uH2B$yCHIM$@`P_A#7Jvt`+_NDt2vSUs|Q*wRX z+ae%)I%WY0@^wjo;f;aeavT3Ulk}v>wecG3^Q17WmV0PEDXY|vxVR6!ls9^^+Sss~HHMs=YbMfreho!%>3?5Kw}E(9JZBK))cKp6 zfH%WKKmdTaqOl}TN9=--!42;sxl_O#QwI1x`{irG5zREPg}B{_S*$+omCsbCk#qxz zBn?Ib?%k>FrH%L@2e6+n{!CMG(}o3|MTY}HW4q;g3etBGZnRd71=Eq5B$z-id@QSe z(F2QBH~~cKy04@JKa@0-#bF__h=XM8>_JWOqKSD`lI31TUDy2-eb z1gmCfF~RdL#Z5p`SO!+L?i1K_IBj@YjzlQKiXy`C0ACZTr}=yUE7As>(bGIHvkX%S 
zF$ukv!zKse_3>5B)^CaLkg3qndANJa z^~hU5^KpOj{n3hB(i?4mZ(+8*gSUSr$^Xk1r~_38QO5+65Rk}w2m0QRYYX*(8KGF2=@KL?|pgh~mXVB#qL%{n2ELj^Qz(o zE+1Ql?X;KmY2XYWu{9Xp4uCf+}?zb4AUD+2lS=E*_f!x8XD%FLJ;0>0(;%4UB!Ur*nl%?K(Rgcigk+$mGui1uB0Yqw_Z(9 z?wg+u1^R^*VFWyU%XAnE$R9#fgTGkHP-?88P7}cte3;|1@Oe2*5v# zGJK~*pA>!S@O}GpWY8dW5<(HVEAC7}VzWr*&V2kQkZ=SOARV@MC_bx+lRa&d2S@e@JIk zp7VR%TUM%*Ic8B&lm2r}k6TE5-e-8;>40XC4&@ioWIqNdsZIknB?BLtY|WJlCN%Ut zYPqHbpBWO`XSW^HiSN9nmcfA98-`6$9BAu zvbo$gqKs>6Q>YhuvhNi!0Z^ysMePKjZ zRaW-%3mI0QaEGrCoUJXsX`fX4W#RRgKLn7?M&mx?MxV3uwqqWzkcNvZBjz)&jBH!v zkzkY^hi!%McTSMz3+s*`>WG|CHzBA`I633}amlQkif?ev>8Mb8Bfi=eeAVS-G5U_> zgdon$&Brp4!7UjpS54yzbZ$@jd|Y9JWK@@RWAsBqCO%#nEA5^NvxXT*32b^Ndy@2mjY?wjJlHs`CpA`-V00 zEupqalvt!OelcRual63f`~j7eZhfuR!z9*jypTg&?l>oVGARL7{Da0^|NZ)B{%gu; zBI6)CVUw1pb!EP6zF=LEW!DiPS=MVgt-t#!lBv1@PhR)YEl-; zM6oGhi)OD;@$W-_Oc&0HCW0%4h!1}-K@b1%fxEGds5HeU zU^&zbz>lj!mi|osMp!w|Qk!5HIikZ>cZL)FXYQ?66PXf>4!u<^YweknlS}lNe8WY$ zH4zbmjn$rJAnJXW+Z)z`ho{j`FF(YR3LN088%~-zOx%vaHF~RIC&w`)Y+i|s4C?$Q zU_9Ip!Iw9CJm_bXuh5mZUyH?$6iwXMK&PTbyf2TCSvla!oX_a$*zoWFy8DSpO?5)c zKm!de-A3;r)Y8xPE1}K4U|x|K|8g*?O}X^}#gppNWeeu@q4`NjlJMl^^S|axc^r49trGjWD5xB|IM6@)BC3%3jxY5FPGKcKh;j zRuC-WxAsZ~Q*mWOMGXUChyAPgq&cW=f-4*f1l}M*&kag4*yU7Pmd*B z*PqI@Um3QrF(0oz{dHAtBHNdt|Nb+N-;F_JtD|!F-5d4j%B(}{w0CBeaCS|xT7XUHC`9@2Z37}Acp#uNvd6;Q(d)K67Q7EeBRrt)hnXdL_N zba;k>8IBoi9)Sa7*L0je?)DgcIL7kEvZMtELeoxQd^KOEY5lcaN7Tp6!QXwH_%=Yc z=0)uVy6yc)L7nYOIm$W~> z-KBD^ysztVvASD~j(f5&T+Ma&Z7S1dJo_8VvVrBzDcWZ9hMmCpEkEypJ|s1ym?M5B zn)E4LAQNsOFUhcc$Do@Y7q_@TVx*!>{L%US#*w~y#bPmqq==?dLdv0`;l>6lE&201CF*L98q)U^wDg}=Z(wK)mlw|0*M38eb-8O{C4*buXw6#h zZqm^-y3QY3)_L31HFOE{3&rz2-lI)Hakg3oto(u_h2J!P>Un!$mxU|~ajPyeH$Uur zI{sPGG>&lHS8{qKuBNWWI*IONIPikeEmRbpm98(or<^;f}i=Wag&~>N*{*(2Orp^wa4iVu`|dt|nnz`NR$N z>#X^`s5SiaS{w8E{8%hLKvI7oXymJ6PM%=-aWJlM_(%$7wQOAQJ+`MyfAxwNL#9!r zX6C&D7X##!1$P4S{S(4~-(?e559e;&9e9vq5ZYC@qjR+Q(HeC63v;}1abRi zJty74Q!`^~v1@L=t)wXvqvoW6`_+ zbYH{iPgh&ktU|>l#XJ>T-7E?#O8sLnG;c_F>?Q7=$cWRAO{diiG?-ZUFRa$tYCFU2 
zCPd3>r3%cW;9t(zt{EsRDrzWi#&1ng@Nm2pAPa1razF~dUkiIdoF_XIzNIpK-GPX` zCp9GZbocWD_WNJml%-?Cl^b+_YCUCxJlziSx1Ok>?}eww^>F(lt4KsP^Vh+qHF)ZB zY=c*|sOsGGY`XpAHV~!5U!YnF>kZ-4)O zCxJ;`lc8KwtyG~ZDWSp1MoxY+iVkcjkC${BGfgET4-Ho2O&rzoIkeN_)PU zAjhEbpKdC6y5iRBPu!xBH|^#r`M#H#l^dm2EUPmrxUug_fB8CRP2KtR4?KJovQu9+ zCB=TGZyqfz@d5(jvEqln2Slzi%h)F$%p;Ii>?P!ivlo4D#;~ZButE5 z?NT1O(TBIDt?-o(6MoMuDj4sMT-2`2ag_-b$GY!waHg9CJt(gjd%~0QdskE9Zp^=Z zRPK@QnfrDl9c8`XV^)QM5Yp;aHFxM2trKIr@J0pxR9M-tED10hUr(|O7vCH7KvJ0n znRIcv&76GkARA0)Le`FPT4SIS04XdyaT`@-N?*$WeJ*l1`7x_9dODt8m2EXo?p3Q4 z)_a*}3yJ{Y?t z!@r+Wllv`|$Q?x0pW}CNlAF3oOm6<=8%JS!oha(-Hykdo)2wW3;yygLe=lXnV_F4K zJa}&Q7w;40x29Dwr>|9g=&)1JTrytP*xTp#J%y+VAQM`WTi`*=%!A+XrQOExT(^)G z;#($Yjg|hYpRrp|nO)S8HWZ8=7>U@d!ayhIddqhD80h#)P>Dm8hZcsT&Nq9GzN*NX z53e><6?tRVt;=mfVSGHh(RNKzp*tmjJXb~L*~?O?9P!yKlA>ULi)PT;3YTXOH>t0k zK%7EUo|l(ORFr^+M_pF-XtDfwypv!w?KLeni+?+}+Xg)rdT?Oi^intaIiU(Mt|ILj zT~aPeA*-*UI$CFPmg+KlCrMp;I4ra?q(uMgWkXo$G|p)OII ze>Ju|I-2Ik9NTi4a5<)iHf&xvG;#d+W=%KvZ$@ulKad4?Uv2MGxf=5a-V&**Ohfj)keAoBdhFpzrw7uV7n2(~Gp%AuzXYz?goed8pVHxEhNMB$7g7)VPMlAIo zNVo~P?XRkH5eP%A?*Rc!`(JbR0%=<<4in%0@U2Q3vkjUd`kvPPVexHP z>v-a0Av5a#8{Yf|-X2CY=8kBCGbmWFXiBs^p7D_suL+|EzRuIGTS#*ztucm)9zFF# zH%d4f3d*(&>I`R4OmHxjG!*=RPy4A#+6eTASSdYn&UX@2G)|XrLctTcyPeKgWX9c! 
zp;z&5J$pUC{>xL639K9}?WKvudbp=_P$b@B+VMpyv$2vutZ!q~sgH`@KE4_2E4Jfz zt<1Bi5&%;|!%lB9VG(bU>1_9&@5VmN!AyNS?IO$qo!om}F+r~l%B;I-Vh%uHrnMi5BMe_%7O zbr4Hs4dIsu>ELm-VfU=1OZ=G z35k#}L&SKzJtWx0_BHhh1vrt}D48GwS>ACekB-FXCa#s0|0_!NCE?$oRd}V;Dr{U>-9w@GVk#mOU^X zfKj|LQxdJ&dD0mY!S@YZzvMZUoeccKqGBNGw0E$_W~6-LPBI1@7NFsPz%vJmGmIVQ zx(*Kq5*nHa+Vdx=tadle%@!%Lgnm6;3+IK%<0FycPgxtW{PN=R zvNAe$Rw}W0AN@GG4$mtSi&=-2j0V5*($a`?!0j#(DsIAazIbsXEUu0cEP^gOW*QPI zPJ9)9b7gp4x3vXM1&o}Q^MXLA(ZLS0AwVdd^q0@zi$+RC0~hQhtK{%S6otl>H3D16 zYJQQ^^Yoq$3-?8|`|?00(%>Mwo$)}ng=n+y!@u55qWI$Dl5N7 zEHuT?%NWdP_Rl#Tw|wpnsoCF9;%gkfXY~fbV!~eAqcPUHj8IcEXl+8aoV9c%Q;|jn zft`}wgAQ|^_}sh93)?>Tc*W--lg#!Pl`?{R1H(q+ue*l?rpSH%kNoJ%;I;+p}i(q6{|J~|MY|c{7W)f#er_O!6KBx~yK`T%Z`TS+ee* zWo9H&kM4rVWr9c*GFngO^(|*f>qY5CAv!L8{vW&ZgU+w^47!tHJoimm^1PrTkrq(! zQ8P<+N44vA6QS^6X%z4+4SE9Fj7h#KXlc>D4wOI1C_=9agvnWYLSm^CmnL62 zKkyl7=T=8A<92!^aAW(OQ_jf6>$2AbdGF}AU9LR_N`}t|q^GICd*7UG519zg@bF6K zjvgCn(k0*xD=dWu{t9YFLf*-(h&qhtR|sbz4Hgfe49i;Xcp@5xM1n8Ml(Yx)Kuw0ezUL@6j$*!TLicvV=>}O(<36Ws+@MuX-zc(?5 zqy1>rU8T|^Qsr#aVHd0#lmSqVsjBM0Eq-w4koI}M7a`w#9)EP*JrzJk%0Ld{&ZNOn zkhaZHyT+A|Qm*-oztZTFTgNudaB;MNq_{C2^|`am+JuRHO04YfU|H$MCY$hyPsGRP zw-*?a9b}AyMKfQ%xUVgN-g$XLWl~fj5gNEv06#c9{80TWCnYfOV$mf91^1oNbC@%u zme0XOUyqeYMbe~==u&nMx5=v3$P(?qqJFROdWgS zrWdZFtz6frF*7?REgjx*X85>;G9}DTCRD=6XXIq8tPC_#vOwdy>~Deg zbH*${qe%3&okjw(E+tGsCzwqR?mDJa^rn4|n0YsLt+=Z6XE%#c?e9UOvAOSNe>EC` zeR-CZk`WTX%j>p3na9TB@3TQ6-z*EM-~3UgWtdRzpH*I7nQyG%q%!4r?O;qsUo ziQK8T`g;w_J=5FYWXqWu8PI|Y3@`~r&_2@T&>q?g9xXgh^yOAAVsdIzALo?}^eKRx z5tsaMl`)1D5JBd@y}zlP*OH5GW6b5SwbJO{r$c=`K z__$z_eiKU2PLlnL-z!&fF2Z;90n7Tq$Y-epA}!*ML7a-@4+IV$TZS$*DF-?GKl-QQ zwt8R^@j>Y61WptAb)?9ZFoW%X291)!w*)ACn3XuKh-Bb4#xvbecM45uv|GybD;)a% zJ)A)ay7^%-`tC4;`jieL3$3(kBfZe%pRs!sB~B--vf4f#6RkcuHD&iRApABYDbg*wxi?a

JquC=K0@(A-p>T(WipyoB9+qN@3j1xx}$GULpc!CEJ zsM5_m+sFC0E&w2c{#BApz?x_&?nk`r#y=3?O7b!~>h_h=Hv@hk33j@NMQ&DDfXK1+ zajklB(l81hEx|Lxal1GvBrBLUv*aFI>Z*BC$TwV-^S}7MA?EHCdbE~$`N;3*8}+3` z5IkK9cJ{O0(-YCqu;@?NdFH^Z`MT#N=wD7LusycGqQu;<#}K6wpp(cxrWxYE8xCoC zog&Pg2PU;7UehKMdwB4O_sMKZg=6CcZIu-$2zQ31G&v>SZ}#n?9t4_uz$#>8a+{sC z5ljny)gt!0<8;>6iAdw3prL8AS#*G7{0d55ikLgofdOkCZjcw74-EL|HAdCE^|!?E z*fv^ysTMrwtSFP2d$BzAWzd6dlI5)iLk^cci#D8&iqKdXpIz|};oUJMdMMdcriX{C zwRC^lR@XnC*C$3k51U74l4X3wXM^yh$>7^|@i(03C4fDWT{ zc5a^2g)D~($<5CD*<3R_W%lcU3;&gCqN;`lJ8L7zvq>!)yn~B7OcB&oK$c`@XP}@! zsQuRxGbf_`-r3C2<{`$hC_Oig45RK0WwbCyAz2Z>Yo*~wKX0?-vo_dvbKt)K*r9l z85u9`K4Imt#Q!CEl_5Fm`JL|anvz&An5t$Yrm#DZlsr6oG72}La#CjGJ_Ul|Fo=_h zb(*VUsx5X9t8**+FAO*>X z3nppGOiTOKz%oE{kH(nw&9F;kbl#`!c-8M)S(!As_Pi;LVmca30v31PpPo|&vGG$q z{MT4)#`7OcalWS$=a@d(D_yHbFv#wG-g#<{DBekHg*L zCnESjy^t@^JwHU?8d$at(-SG-UdTGN4a$!VvADS7HOq{KZ4v8f>N+I7DkFk~nj*!t z!TFLYl5zAOSwhW+vOY(*yd-o`7rpQ+7y6`3mK=Bj zv})dzX^}~1`|A0*h8lA0e#A}+X!md}g*1xIFdxh$^lTV1>TQ>-~UoQIzFz&CpY7VZtgewJkc%(~1%fy;Jwsh3gC@~@(x|nqgkap;waf4LAJeT{kqbKjEHH5%(T^`G9E;Kc*{8>$g^_C2CqEf-y3L^OQZsSZ0ls@6% zWs_nv0I@2ysQ+1PzBuE6nd(auz0$$m=8dZEl7xuMRGz3sV=AIrm4Uu`G|T#}EK<8+ zKKDPosesYArHr(K?@dDZroR+eP^15iGr_??!|(CKE~aGl3lz_oP@OauPjYU1i_zfj z1TOAh@6xH@fxC|e1@Ks3o>6k*FOdaHkm8$RL0zo;+_WU4{rx*FhRCeLH*g^+98xGf z&0_LZqf(5LiABN;O=m%c@MV6r5VHseD-G9H~CZj<9C?z3U&iRyEKnKOVG zFT+Kt2&Eva%!-uJno39%!?q8Pj7;0CE*t-{SwVX<*p0@ANRFBM;RV|cJW{;G*l(Ak zLK`b&I4r-rU!iMw@E*+wEfd~PcOtMD;j-^O$c^CY^LGd{*@45;d`ahi?j(!9XYtmn z3pf1)1ABB7C1m>toO82N2{=#yYi2>6&y@e2wyd0SgkpMfuq>!c8ylk=?G}22;AFGB z|C`EhB#0_btZ(#q9UiMq!LKL6%EIymyd95oKh6={U6L((&wbxa8jteW8FQ0`V?hqE zNJFr`u&Kj0#?Ugel((CGrcZb$rCM6}7bHUPuIs|+&rj`h9=lYf`H5P?;xCYjOfq2z z+^>9==KJqrg;+H&*VkZSqk~(Z7PukJu~$Q?cn$Rpx;z|4O*jO=Sm?H4)PTAAaJC}X zhYi8lpX9x1EY_uks;{5cN7{uF*cc*`2?2%Dyli$49q?#cU|&lPXdIH;f=Ec>aL zG0Z@*(9ElF1_)hnQg96|-2bBMtE0N=+BQ+T8|m(nmhSHEP+GdXyBp~aX`~yZTTog` zTBJK9X7fDn%zW=!Gk+2be&?Kh_P+1yz9Qt?5~z;t*1Lts1Z!$y@4$;DZMQ^mRZw&O 
zR|V?Y%?=r^T;M^;{U$vcYE&Zy%N9cC0a{s796{(H z$jasAw177`i2b<5=p*?tIvy-W85A)wJk5`T(lq6J^pp^&HhXi;3oZz>hl2V$dAk&L0ymt-D8DDm!feT+^V$U*PGDg>+OzXXipVgSeRXNtz^id=7r^1AIP2ZFbYGS zutg%8k(QH#hu%BjAWX^J8(=S4?(*A`p{UE^H5(fOf)k7vh$%sk*UpD=X-aMKvuQ&Y zFP@IQZ?CvD?F%Bfa`BT!*zQ4x@gTn$dhmp3RiQ;?5x$q$CG+2X+ou znecl{aLqIbF0-&m&|-QUiDZ{|rvJkB{fJ{8MRj(jNA((n|NCEhD{>ftUbxde!}ZwK zGUPN$2HPitU!6lc+lQk|rG9Wt)mn$kvlNNx?UE zH}L!Q$5KM8(s;YsS}-F9Xtw%YysE+BcpW;HAv8r46hPVc0(+`62CBcsgZ2jCPzNR* zvtvg{G}Om>ozUp0sO`7_G4_pW)>p5lm6TN?Ec!MBIU=C~X~|S!ghdmp$eRGH+?ENi z1RA~E9#Z0CH5p!r7H5VK)j%+Z*e)&EF*6&rwzA?ts~bcn59MMiRi{o;BsSTBb_w`o z<8>KVJ(fr&E|12E*0NPQp{J#-0HHJ=yj-WpS_s}> z-hmLl9^w`;bH{b)rJZcogwF~1rU<0pO(V#L0?V{z>)Ej*r$Rpjcj5P*YY}Eo$(Qn~14Yje~)j+Up1b zQb0q1NW-=*CIVZIT}-S$?QLNG_XQ0JpNA&{JbuF!84@^rsEvB!`=P_-96TJDR580* z`A8I_It~g(q!w#4Dsq^!I{)f%1mx%kXGkbynW(pB%(+U)9;pu75J3?kDe!tvut`zc zb>dPIFs$wP2#QdZ92smKoXwtWCxmrpD zg@AcmS2xkaGP(ZDBPb&`5dt_v!FHke(EJ9_%0BGdMhdI+ED4y-~0%#Se7J{sMED zo+7>oXi?EKmF_|{vSRmWR4WBk5KPjhnBJ1eujZs>D=JE(f`IeGz`#&~TuPEeS9kSJ z-}>ZdMb6(d;f$6TZr77ou}FBwzQ;WRIfi}OUog`bs{#AOQf^$!Mt)l zW5Ni_|J>sogl~4gGr2yl;POw*Sjb$No&8ye}Gd0XD?qao>_d|2SCBVym5_WXg{jXbx^#(>6W zkc4f&ywzZIV#Q`3p=jNlR4BvVaERoQ?Ln^hz!Aoj>DU z+q1J23pFVUQK))n;qAeg=lOio*a&nt3~$Z%<_{Nf8|FqHPU@skKkN69t3wVto}aHv zwbE(;aE3eMQ|70Xl;YaYQz0cgpoTgOai4E;vU&}zQ!k*325UY4B;zFx9wrWRdcR=b z)QO(~fpZxG1|u@56sat!LI@XmIQ0Ze1Cg7v0oK$YSL=vV>*xxS>NVO+|EQHcus znY)k4)8jiDP7Os3Y}IIK`dAp`?YY^1>n6HXlz#Z+9d!*sl;hg@S;GZaIAabz*Y5=U zmlp%9`+@caCZ?ulsf<>l0^%nt_;oOB#)(ny_oTLToEymdJd47uZw{+hi{2T#JiQYx zs=oOAA^41nE0YV8wE`J`V`zF1(YOm$BkQK4FfuYb3W2{o{!z44kp-7eq=@M`FY)P1 z;#iZFKjExWZ)@gLSzUyRyA`y^h1t)P?d2BvU8ba&X=N6Uf!MRpN+Dieqzat9C%XNk zJ!IY_;V7!^4zol9Cyfh#ejo?!UbH6S5Ca1l3QL95!X9rbu;34$pqm6L=3@2Ip-z7kgBocn416Kg#gXysyUgt`0Dd z*T-X`$deF=UpA^3^pYfFj>)3k0+aUOJJm1V*auG*49KAF7r?2yL8*ds_)GLk+1G4B=v}S zt+@R#Fmb?74!AO^SK;WyjIoenC+2NMDHGYCOi-sXVY^&mkStyECX?f>zr(sXBCP)T1)aeR4vnMY&OB#)Z#-BxxwI;UOnBO%1WBY<*h61) zse)_kFANm&R#6HMGqc!2mEpIX8o5vSkwWO$>uF%`^YHq%jgSdAw<4Gh7FsFD9^|=t 
z6Ua2)*MBa4^%}UfGk^SriHRYtEI)FsF6!);nDKC-8#Ovfu+X|QSov_$BPZT1tj^Qu z5cI9g+C(v|ebG#+%iGP^Su}-ViDIm8B|thOARrCYL4=Dv1bL?v{|!No8To}mZMYMh zGjOOq!7AjkLE1wp1tP1ccp%Wf$~gQ!iT5$-^L8Gd0v+{sHg!g4Cx34)G*#TCRoKg5 z7m~xn(C`3EnMh;N2;*-OtGdY=q`t!yKpSiQh`KBq%;fZb0z<4taYjbQB~B5LCNxs8 zZ=7~W4r7b!zBP;iH8nW1l>Jg><$W{#u>#>KtSFtp^yYHuhgs!fc@qo4_9mCtA(T4w z(thGBQd^X)F#nmm<;e&REOrNAz8*j5bDM?{eH1lE>z_wrCEZWP8W?arRkrgyqdyZ} zT59!3T`4H3UcTNX`55FxC6^EY9=r9s;|A-@0dcLg9;YY;sDvwpL-zpHq2f-@9Bd+r z&!wd{y7U$#*v(q$BdZq!7P{hwI>)(EU<_I@a_pH3QLuVJi-j{-(e9lKVE-`RzveWQ z;)Hye`09)HVQfnR@!-C^w89u*PJGqjo~EI!T%kE>bB=Y;ba2jBxx?^&2)vOma^!yo z&IYDCJ3A?60?Nq~q8>E+@f9#l8Gp{=GOdsFWHH5|jM@4BtMkBZaEg zs~>mcwU6Ety@2F3zV*z0umUybh3oWgbLb76^ z_N}PU_HJpXA5ptM$t9$yS%EnqWXns%YR=A#1Fz&BZ-Ay`w0#Yf-_9V!)grgr9jtU; z@O@nC+LMqN7MWzJ1OG%^No!fzs6|FB3&oa@aiU~NG^!#a>a_-YN>Ew7)liwge~#Oc znW@=~kWU2o;AdwPzX>LX6dH4~)ve}|M#rOjT-ncO>9jwEW`emDi<{%*_+;$mnTTe( zwhER%&gs>%--6F$F26srQ06O4ibt!Vd(H|F67k?LEflKybccb1hk<`5@r#X(eP?@{ z)RMQ+^@}y7&#l-7Am^y$TZO5jX{b%o;^AqK2#JhNvz&b|`XUCKv1p(Ez$;(rc`;E| zR?F9HAu0JOCv#Lm36;--1wdVx3{jh98Plg_)!x3uLWs)B$_`fZUq`z|d?uEnW1C;0#%_-z$;h5ML0bhS`yCnkWce2VdQU;p5 ztX%4P5~%+HCdurbslGNGEw8lZbbntco1(xM$EaT?VQ9E!@?+X>by<_WFjzKaOvb=E z8ovK|p{r;{YE3Imr2%rS+k#oykCL0aOw3#Act28A_?ueFCd42{I952}Fl}-FVlqYa zti>T+>Ybc8vx=`IoqYm(2R@f+AvftiNDsBFD5WNQ1t*gkHdceNF-I0h`Nr`F zHX9E=i^E+Q00hY_emOdk(>j?gv+H!7tAcU0W(5mtXRyhh2F~4Mz%;@Oa6_#P&I1SLis&trvb{z z{sE;m1glK2DCM2g%uMp5(XN?=n5eeL#E+A;U^tjZW&`U97eJ@!4XB#_^cr~i07kK- zw)W6MZD`vUmY0tJ``t z%xZBsN;PkT7kSBA8iwxJ^(j{-2zguGM@cRc@tu!`e+F!s2QyIM85DFp{QJ~LYPeNJI-SC|hSENlv~S-@mFsc_=L(Fzj}yrg94jpa z$Pg||X%a(G zJKa1vD<~*XH)D}bUV!;z`j?mqK+Rjp=gx;e`966EG$8WYl0#<7_IApI2_p}l$9(Q2 zcDAJhN-vp`X>CZtCKf(Fx@PHyN3;91t|Fr=R zH+S#uN@yjev?TFS7EhI76Ovi)!9}(JBnols^Hayb7IeB-bMH*_# ziF857rKlzr;okVago(f!NCsZWiTmMn1B_zdu_q6_%tjm|2c*LXtV%gN?s$Jc6}_<~ z%yO}riHQUrx5SU$&F|P*+YfHC;k9I@u?nDnHv5w3YGMQrO;}%r<>&WZ2cx}a(i9GO zeyIh%<8b7G>IrjmOZp4rQo9+ZX?TM(4acWqoypp-Bd3_mk%WPwIl|gdP@Mw+qoE9i 
z${1f4zR;hhf>3eYK7HC@nBUNmPx*!8vuocYCU{X}@Ynviw`iN`t9hyI>%7j|;OXxS1cJgpYa1Y$L1uBsldt3RxDtgTTe0e~#wK7cF{a4$mTK!TNr zHariHzz6WAxDzI)NL5+N`@6IGDwIqf3WqUMZVxnZN-%Q+?Z|3#&`sa}dCzb#_ z=U`m@o1K%sJ=667D7qB-Xtf3y5>F!%0p$068u36+hKZ>L_+a>)R^(1N4kDc%PqXvQ ziPHm#C`7!`+0gnG14Bdi0-g5O%V<4MOQL`VrLdin79^ar(&~ua@=FpUxHy3{iAu!r zmi+gMsJBuOqcX-+uAuqrbzzCgtQPG*8Sh4DDCzJz$clJ?NwlxPBZ$j-*oq9zFE20u zJnXXV{K#lAm2G21^Kg-lm8TB<$~C=5kro?0axMlWOUeHl*^=f(XR(^fz$+Fw+& zdR=KWT(ABvH8MKdnSiZW?X%*^`i$$Bl=Enl2AB$ZdW|-3YQU01jNW`=_;n`e^cTjD zysqwhVj}wWBw?!pwR%vI2?GEH$}oL4acVk+gQP#fM+}297toWm6GilaMyj`0Bkzy3{fS(h77j96P(a=H|OQ5gi44;Ox!F z$hegm3mHc+y|~JHx3RGy->M&UNR~W8kBKAU<#m6^;~YgXD=I3gS#7x5CiezYP3?f! zH4KT!3xs9O`dqXs%z7r=UFGaigofy&Z_Ou=3BSch!cnH%%+V$y$xuNww zfu21)yor1#4M=(@-4J+b#7i_42q7W?0Z0l86OK7iB2MasV>~_t@vn?Jq$MCOHL|kv z6Y^4Te9Hj?t8{`Wa%$#Rd9GDs_u;dv-Z%HxM|{@ThapgYFk@us=;$ChR*X7NztM5*SFl z7mH~VE)>8{`Mq1Mmtahp)MuDU;^YLSORn>%eD*Np0Y)nk-!Cd9%&$UX% zQR~PQA+@z9AeEO?HjY@p>+gJZ(b^hK9LbIM?eyyEBZy}*G&BsA765j|KsUXKLw0jh zAXcq+=KbCOZYM3Q@Y<)-`Q{je*ll>8{(2QJ)sYtZedc@L-d-&_rS_a~5#PvQ?DF;2 z=T1Q7o1f-uKHns3y7k-4fgXxTGWpGa=`HmHn{hD zIL$R6lV{#Yh8mUqI6Eqj`s8B3MJ$sDt$@biYCEw27{;$g0oW+BDT7oQn2`J;>D+zT zv`W0trG);Gtams44wzq;i-muqwEXbF6Hp*4kIv&c_V@7|u4vC=+R6P`d)>bRo=b-@ zB4?ZV)j64%REm`p){ccrAB95320GBBo@e5Z%?)DrB&yWSRmybE8J^WOHFa$rbZo~9 zsr#*@e0czy#oa=lf3AS@P`$YVb%l7KDvv{gf5JYnPF!xX9ltL6)wVSpO90Y_fi){J zzmGKeAT0jQ?>ZA;PE5k-RGgG_`oR$FI3SH?GK*91^6!a=h{!4X8-({$YhA&sD-Vy4 z84j=XpKgrOHkjs8^SSRxjaJE3Rc$l zZ|FiT8pJHq*wxh5LtDNp-M;>!K7{y+D=6%^$#T+eiZ9*v^Y4JBlE6KRTFrTvc8$#+bQfZN9O98V^b&e$yEq}K!?Gxs_(6f&zbVS|iEzD$!t0PfyetB;qh$j?d@$3 zm%ZI;DKhib;oQNm-e7)W`>$UEtMD@J5#C^9yPj=IfFL$)E_Vm*dKWr;rzB~=Q%10h z&)4V;&&9Ratq*naxgVSMy?%N;5fT?o`gZC=C>7UQa-+{ ztP+XnH{qbJUJyS!TxtNU9FXz7lg0*7Ps+&Qa=g+eJ{AGV=CD1ISP4A3&d1*pW#?pQ zqtMQRdqF`?4rtu(t*m}dtUZAYFW`{=o>>j5to;0Wl8XAgg|P@Qi)4cV7YbaVc2$G4mgm!%kTdA})Hw?)CDUtb zv@YO%gF+(sa9$$Epj8#n;jz*8H4qYHBA$5fp_r&+CA?XaiZ^_^SqBe#?d(bb*iZ|7 zR{FcvZ3}#UaMD=sB|&E_YxQT2LF>Kq;UX)4jov~{8nb~HHmTPPyRSeO3^INv3f%4a 
z6+@(qw6w=lc28>n0em&&pvgkx$5@|>yV#hRArPkXp)Uj^JK+C5=xA$G$l%XlBqH~@ zy=7%*XZQa^{N_t5BLzjU3-$i~r-@E!%U%KRYfyE?;B$@v#cG|xY# zi@lI{2CX-Vv+gJV7zS`AW8dZLhAr&6A@e0fq%j_<@UtnRegG-=|D8N?-dh#XjWU_p%hez|_+f#1ttPqOBd z(S_$hloL#%@BOuz$w(Oqo|g#)U>v<>_o-*K6AK#{d9?>+9u5mWK!2*l^V%AK|LI@e=p(-q*c;)#a9%SnK|;H(z=0!k3alR^8c z;&uAhcjr<`yF64SU)xYXYSqXFqW?3n)b8x;1QDrRR^(WclI3Hjd)q*Lsg5Ego06*GDa$eJY$$2&Ms|GIhM1|#E9$P2wr*zerkTuY zhx#o$dzSO9HN0wHL%<9F?t8w-ALKC?d!w>IinTQmbw<+=urU(vdIq351yKJPc6s?7 zsgmOo5K!3LuXoCTjYL+~>tS04#OZ)5*_iIUt4w?Q{Raq<;rBg<3d%QJZJ!V*i3H5W z#P>@1(_dIHAVKO;mDR%TaR3d9ii!%@dSl|_N%-7sp8Obb{QaM$;z=6%<)PRXFE5wa z-PCqLU_p3jSiRkKl}-e|_bw7agV!8A95n+&CD>!@6ZN?4!|()`a8D?tT;wd#VpYwH zi_HgVO#iw31WH0BB3BtYjk96Bf8i)_+q3H;`^o7dz|;9T)jktxG^ zJs__9di?FDi#L`n4k>S%q zzanCy#zx-zsi$&-VDH-yhgWlzS~R{aAZXTRt@AE895pyNSg6wg?86qS4Y{xjQ?Y9 z4g_N7H#cu|dsCvFg1&pN3s?#DF z>%)KyhtGh<3if?o|EEqD>tYjQV?dQ5CLs~>y8QhUI9%2h257@0AmFomU8X9@}EnTs05Np3eK!{w)%RR zsa#*(I)1>>obK`#YqkljPt?+=Hr3zW~eD&PCydYT66|6E)f7j9t9l!0I$}^EOUa$p`S#>5ui%UyPdfgA8K3`et zE(?@Pd)rU(l6`!EjCi?OXEID01~M!btF0G;CWL_jimj26iHQj;Z1{uuw6ke%+R4#! 
zN!{555N3gLPD(Kg6gpUAN@O&goD__VTL%k$`r5ca4oWGG#A#v!W|hgG5IFnA&!$We zD;x#u=t=E(Ja$RF zaQ{Uh%V*{LI6VeZdr7L#H~PanTu+Yw{NZ(&Cf!3J>h^hzGEZlvCqBd=UcSF-^JT%o z!AZb6B{fu2OBRoU3I&O~c3|#NJer)?5B%U*-V_i1{COWQ^i*bN#S22oxonro71ZB@ z$PWSzn^q8D_brDno`|nGUyVm7JS3ZETxjzdgm@H5#t+5bt>$(;^jnTNU2@s}{N)&= z=MTXdFSStL=<|Co&40ogATy?OyqL@ZFUMvKqwJeLkQfE(R|#S`i2`-FO=A<-j!UMq z{$kpcAJjW@l~V`ve!6Xi|B4KyrN3KADr#tGs<6IgXV@EV>dsI}1d4&B$jA?WufvYl zG_HBW21iHte+5DY8yJ9<3mW<$YUk%}U%97SNMuuxjP%oQsD>5^8M(z~sor!Hqj9~( z*LUW}4}Ktw>aok|Z8@@tir1*t1&vs+CJTDZ3cB=ONLW~yZ-aU0r30F>yt}(Q*4Xoo z+;#TI<}|2n!7gRUprt!sb$7Yc0B+>q@UYlCp@5gmO^rdQywF&v;>5SGJ+Pg$%27tb z!+Uyqo{Ec$|KhA8cnz;I(^uFQ99&!b7GY4_SIoe`Kub%jt}K?K;Nm^C)25<}3y0tw zq6bGwkkk2c;&%t~tm}V8wvn%HNO$s}*>s5O6LWK$x8v`&69#1=zPpk4KpvKec0)#( zB1bv@p!`vAfBsqRvOM{(SIf&Z=078m)feNDI=-n)i+GN{=JKvx6+QZF%;K|$bCI`@ZdAP<=`Xr;d- zdjB@@7trM0F_0MLz5pTXOKl*2=I>%~H&x-t+X6I=i&+rKNTk8Quq5iPHa1;Oak?qq-jIOz;fg6hh)4#ffvv_Gj}+dpk@jS(^tR#u>2xtO&hKg^wvxBMk*~rEt80Q#v`V^7J#dC9W zClUwo zddhwd+fYBPMC_?I2%?KdY)~ZTaBkDx&G@9Qno5I_dtfGGx>E|ax zd>p7oz2{Dr#bq8AE;dZL(nfK0&1`%#nct+Pxoe*@%=@sInu?8%`ngK~c2WCoU~KG9 zSxd{pSn4g2KZv*mUFY@w>fgohMHEso%F6E*2TEQfZJDMV8!znlENu?pN1Q1mBSWd( z*7h&aT?O}r=+2xNu$4DiI5bP3B7H{_EiInc4GGTi)|_{wf@F4gQLdCGmdzGk9&2Yi zCznhp)2|K|&Uer&j}{Y22T1#gcwL5QwCW0oPknK@d^j+1gk{5*TkN*pW;0SfJm&H7 zHgC?8%@U7w0OcCBJRQ7haL}gR`q=p7Tw{w?lH0onAd>(NF+dK{`dJQVKgzl6W=cXq z!S?oau1*5wu(n*(gQ;B z15f6=(f}EnZ<|adEmHdO>au`hM%7=L9jG-yxDogjC}`L(P~{%s9Ng1;FkiWXGW6AkS7R@%XqTcDBXSuxG0t8La%j1%i zSWz(0ysf0&k%r5G1|}h6=;Q>=i;KVM*?y=82{TV*wcrLxJ{Uqt=K;CtYTI8lhR8LT zu=Is5@0rsGQHfV};tV5GzNFAYhctASN*Zcv)-+1NtGpOMjG=pZ!Mu9)B#Oy7koQR? 
zRmbR1U`2V#u=1TWhnkdXmGv?=pF6L|`>A=RENP^3{*Rlqn{VK|kr3ZZ^fJk}Rs|3q z&UY=mumn%v1qDGW$jk5Z8_E3tP>_i^`9?J zbkMSveR8IT(P+617GL5aKp3+N(4O16e;2j3v;^>E5)>Jhx#z+$JNt|z4mD1SUAc3z4#oUgs|BEaoq{z1*eP-C~$nV2&)jsuIOW13%6bKKun;q37zLi#baL;P(dvg{iWOmm_DJr<{^KOSzj)# zp1cSnj~^$~OisTq7#3Y{*uz@$b0^CCDyq-Flaaab5uap<<5EdjO(}qHnL&%sB z%w(xV6(t7Co&$8{KAp!SXg4&Me*WC7Cb#?ZUA{@h<9YPX12oVG?#$ihWU2~FWp-aY zJ^lsd-15?L+L_}L0hf(-m-*xpyG7FiQR&Ed_WAQtFSjSdaY3zF^ zr=@AAX_&rFMLCvER^`N;Nu&T$#XA~U9||A}yuQEs?QF3T9>@B7POiZblc#riN*(b0x>}v5&^7S$6A}_WJzOZQ^SL+MudlTzkt~vi z0<6OVv|d+#UgDLPxojn4i4A%T1&-Sk%hnd`*A7=3_L^(oj^Z8@aabl~X6oweuYlgb zeywvf9U)!7dOik^Jskp8^W$U|-y0?V-K~queKpgm$w}MWFCPGk7L)TU5|x;LvCVoR zhtD10AMLJ3ydWccIG$uGmpvR?evl5Uva7zK;pS*P9wB6SY02|X3LP>&M?`GwYJ>Y* zJE^dv!$W;#b&(IFU0AH;^;V_%oC?_JFFYzV4O4|=$^NjpFr?JPwCGd8%*(^S)FChe z-utc_C#h*!85!sIt}Anu`b1)7h1QwVeqsZcBrIQthll(63PCOrh<%$WReClVO`4rm z$JL&%+81;-_wnJ!*R2=gwQBFM+Eo5D~ip)^bx@Ti@%Iwih5;2JZM*u=(2G-`_#6Y0&8s zIhnq<(l%o;5fK#yVvPT+RTJbc*O^G+#^du~Jb`&alD7l~8X81(!w(SjtI?!Usi{m1 zpbEVKRfJ=01u??07M9bO~}E; zMchw%sK&5ubK^L!`5E98R#sLZ7QEm2=;+p>QS|aF>PLxV>`^d`yh27gzP%^sY;Cak z2D&lguEW7>*Hx>1E6Y}%;CGSwxnhKMQ%_4H?1`6>+{6p ziyzZ#MTF2k8X&tXqV)f*>A?vO za18!8ea@c^8;yZw)-thPl zRul$b+)xFD7S+_C2n;}(ngSwXKI94vDzj-l3)}VaD&O|lu3n`Q3k%CF&`l>|dU{4$ z1eRA+IOV}Lk-be|qD&F{FEDs0YUWM+WO$vudg}ndgP>X zJUl$jR|jIFK-{TnS^&VEsZc@@2}Om%fM35zwbBHI4sMxb6KO3!0+M2GkL%hy3;sm~ zMI9c0{_MGPY(RgR$iYzE0J(4Yr-wEmhMqRn?e0s3LDuqP0w8FR;;#2998T!a0Y z$gl@l&&O*W3jlXw(reD-p87)cI$oqodj)Cm0pi`T+U2Y4RBTnA!;R&XpH0>AciDq9 zm``}XvtqGw=OsMUJQ!*d5V)Dbbt39LAq z$TfBQUkeV?-sV_HX(upZw6MHe1n$f)zRjwlCY~#wzdEu*j7%fh!6QG8%~vPHawkkA zERUk0%HZW4QgP`tb1l}RjW9WsJUT7G_C{7Qck0}pMd`2?RH=>yfV(5yU_QA!mG=y` z>0<$}%i|e~FImfAzRT-!2<8&LaBs>GqzU07z%;M}34gZRfNw>$T9 zDLOIn<)ZCZS~tM{zUa8MC@CP++}sQaivmsna{y_-z#0y5ii%)-(Nb+B#KaU!q-*uF zaUWDP#QYx9)!y!=@OTGHm7u|Aa6AD^WzkFonSd)KCcWqVD;u}t6&do9{*@IUF!(q- zJL^FqnEz~V#9wFKVrN*XsWV&FtBnP6V+#c_hfZcWpeJ z#11OBh=>S)aE6A5ceJqw0qsos-T7cp 
z-xC-~VRgEntOW)JN`DEgW-=xG6Ie%!%ZexgaKO`pJ3virJUDbg04nrH*m$Qwk-Kd` z1XWQ<6#ww`vD>muoL#g+tG1?MNE)*+VR}0`-{o?jkCcZea+%=OCg$d+T5Tz|u&A8< zjmV$WX#*eDyK`DYv7MzY<`3_bi2o+Ozk5}(R`*y7WG28lD$JbxC!#6_YXps}=m5$0 zv}wHon40uC#l)bod~*n0M{uAB;k@@QAYhQ_0SnfH&IR)c%a=y#uHu&?o%eFziS`bav>)-so1TVurDe4RG1ghid!r zicU-d;Q4W;6T`!wL6tAc&`;*&_jsj+3)SWQ&hNp4`dSJU5efb;6GT^6FtZ)OO_HF_ z`0^_fEcBH&6IfMm;HCr$QY$?jiO1G0IY)%l=SblPQ0|xo^gKKJ6c$j4ydDSy4*74U zdh*_=p?o}9as09UQC?V-6tUg)#O#CJ*Eo0SSu=8se+iY1vm~=I)wW`w{G=u&6Ndtb z51Gho)du_BuTW#iV00BY6qKUVZiBt-8`=__{$e|If?M*SEHjG~x&vPJZASV}fcAoV zvM#2+|9Cr<18^X=k$fb_e5KxeSZqtxpv9&GCulKy$(^)PCpf4m_`76B>DUm zGy(!5v{MW`lM4H8lWV<#j#obOSV_qn|I- zgTbZ0>7wPr99FYXd7{Y_j0(Z%xv0{cBW~=4dZR2n^xQ@XMiB- zaGHQye~s+Q>DQ;Ow;YDXacDzX-~F*PoTf!nuVv$Hs+pE0AO%v%W{Awmn~7Pjos}ok zNW~J$++38XW3?VBgEI+?%1sF{nvZBy1GpQRZ5a9%Cv|F%U(TCR$_`mP))Az7@#a_&`9xt@56^0g3wYEZ(SD^eW@#lZnrZD97lyn( zEA&MN_vwuqnzw=@Lswa-t#X^A_teLKyaGtwWq6>bSN~3he(F|}C0G*u1T|&Y`2G2+ zLtzM-S)Dl>7!t+)S;G-c1%N~dJ!c0=BUNExyHlCKh>dVMks6_7 zMa4`i^9OsqvlIPjYfO74xuDuieN;=2`wk@&)1-6UaIQdyv_o&7L~iayWDh+l4NYZ5 z;a4bnA|dPI$O^oF{s55Lh+^X6{{Hg^SaTp!Ge!Zm*2^eHK0p<(z^W9XJfpR>6=3f3 zw7X!_$^pVlOFiy2U3zAr9RPbg=l}787xfjou{?eh?!d4`HT;7UeaKYueaiHUmQ2+GSV04kFU z*s?);zEEkj-mXaG0lKG*w6r&%asxFWaM1Z$YEf$ldUYT>xZ@V^;D}oFjgT++F7MJM=C|w-+Faq%~7~}9pS-?g9!3w$s^Gt(A57yDJ=;(2O zt(xuKT?_(IkcmA`?Sb`m;rAI(X!{_?v^6AlgNScKc7Vo#vdx!~Tg0d)3ZX!@g z{BtEGBFzV1BV)9%b`jW!?^VC;=ny2oNt(QsP#rG3&$HfpK$K>D}Y$Be`bJT$a~>}-C(1#8RAH5EX=1VC{(DV&RoOF~d! 
zU@thojk>(%z!V)cVg&g3eK7Z1fW8Tej1M+8F#t%hMUO1r+}bMP5~~5#hZsy&z6mv; zI%SQ3;M5Q(Q-Tbi^X-(!CM1p`SyY)H~;J|pgyu4glS?+s#t0Ds!H!?CJ zhJJu0O3TMA!ZVpC^gOZD0I;U%g@s{*5lBH`cp+%F!VMiN+c7F)7WC~kv&cxV0DyaC zzD(pd1dNA)f$QDH3NTyH3glX^Y;6eeB>SM}b`ihLn4Odf9pfJ?-wHt;@7Tm*VV*yh zp~cRjqLN zYQ3Ad+LOqw;f)0IleMl7&Gyev+1V%0{(~IjQ#l>(FAoN^&La~5P(q8W5?w+I2521k zO@g}~mosuC<-!j4V!9f$f2sa zBA88J;C$#AC4?S9cHKC>8Gzwxc_NKs@d;lYjwh*sdp}=%GY?It;Db_W2<9Ees2my0 zAWuzwaW~~o-pFrlU@t?y-W6;}H!!D3|0(5_bU`z@5jOVbEl=dr6Xz#A7Vn^-xZ@|R zyHnA>DzXUt&{KVwXZo6|sc;lwsNpfBepFN?FX&u-qxiBA04k7?kumH|OtuaW2?5wh zj4omfu*Gm7@3gx74G@V$T&K7Ey`Xp?OO#fwHSB`l8a_BUNG3f5N57q|?IS>i%6?m7 zzz=|1b^4)S9|oCRKoT&xj(#>bS^rGA*;fPbpfUIcvq3i@Iy$S@Y#Fse=2wVBmZz&U zqjs15zMf|S^o@;MaTyuaf%RH@BQ>>|1hgl|RxH_6$3wZc zaa_@2D0@-8F-#b{u~i0RXY3 zRZK{ENA}EBFw*j1`72RGn0%0UJ_Kp({%jJ`Nalic>wJwr3UoraUwxXv=IJ`t+I}@? zS~ky?)iTkQwy3hr&F1XO(zVPGo!wm~Vv>XrP^C+GL_gl-xy6T;MT3^&>*M3&>x;43 zc~eZWle&K7i-rB@__$Rb_Mydx5tRMj;0F#gq`jQqNT~PgO{ljr-_-N*4$A4k3^?H` zc0x>%3NY%-%os5&n0TX4f$@z`SqxV@X(TB|bA6UzeF@%C*S{_RRLY=TOb1GMt%W6v zQ-{C@L@O&Rt8(}*<>*pBXbubhhWemdiC9k;J!c=kgw2^Q#v-XdBJ*i3hQTQy8~vs} zHDjSC^Wa zI_6A`^?uXx~EJZ85&}%zQ>M zqi*s%X9NUOP{*H7Yh;NAoSs6+@HGy#*Pz)Z;$^g8`Gk97Hl*bDtkP-ueG2m!?lVbM z9}wspe6@Ugt}wXMhS>kR9FD_V1jI<(1U&xEiIaSJiEpD!We(_=F=zG`r?g6=*(pM= z==EDuCoe)(kT@^=91P*7xHfkA`l1# z1qC9HXpvmDq=jD3-Qjzhp^NKt)F{11zQZuf>YZc(uTDFxI~4MEB52-)ROaxY=0eD? 
z;AL_m&&oe!ywOR1&B#KAE=Rq_@4u_$0^4oC7y;y7JRW~Rfh(bmSEn-sNf1*Hyr%Fw zP-}M@c=67&)38A3GT1bXt(|r*eiF9}7`b#hUg^h;e-$9pI=?e6w%SNtQv)B2N{JQ9 zF0nCiV(x}Possny0A(MYd24huCiJ5?R9ZIuCe&4(i@;m0$;k|Yn1jRCnu#-S3u-$t zYIAH~=IJtyEW6`eTxM1F8)%oT&$uzJ{6%bD^QSOHt_;lNccKbiYWvFVI0J6!tIHD$ zVBTFB*9aRZhF8b7JO#3R&pW-C<2OhoWp#DP5O@|DAWIyk7+qbuC6FuYkV3(3aYg0j zq*}SN)swAJELg@?-MKS8EAMpl+lF#3nvCC(zW~^salk_RV43x_cM`enx@Kl(F6r}& zC$SZy`5O>8 z)&E$2SNT|*tqIDMN_i@T2^cP2iJqB3HC#>Pg^*npQm?}^{92ZAa#u^FW)Ro+l8nBJ zu`g7tLOQG9SLa`^S=lw_mHw;^hveCKw3%-9E_8Or+THVLcH*ekeN_&NupA7Ohl(zr zqdJEwoel=a80^eUP7nUFuC!wZRVV82h*bsz?HQvK692e40W6;jFYnE*tgvRk`bpy1 ziV1iGOA5WCwrhC!GWfO@RlOV>#vF5UBE#1#EiKcMQc#(Y_hqyAx=j)u6!lI38vV&_ z{Ax5Vi7$JA&377dzuwcN*eP{-yql{l$qF;043E+}vM;H;pZ0fFR#sg2Rk4JgpWXpp zSNC({2^^dzy)Aj{0@4~{;I}QfG-2>y$zT@nh_Y+;Srl?AtxIsVn?q2>;|Y)NZgK^U z&$wH7;RBIV-ap)bzOcC1-QS&~AEYlvPB^bPQ{%7(>HVw7FSDuX>Cp8)sjW?Xx%?MI zZvG>f;mlP`GjnrSMk;UB)O^aRbb`xb$sthOv&Fs#?jA$`=#mo8NT>0LwPhsF;sgGu zwR{Lb*{~WKk_o?)lG2`)KFnKvl`Af}=q(t8ecee_@D>xK)f5kkg5#x@U3D9SR+om7 z&t&%8`nhTi>-i&#t!*zM8t6ao)RdJyc9Ic>K3w_SG32lUhdA<%HW(Ipdsu7|`pp?k z>Y$gHtn7H5mzAbwlSS0lSXE>M1Q8U@HMP-8ig6mnbh?LElwvECM`Iy#w>3kfMcz-) zpZ_!lE-k2EQ@VQMCdHG$tag2-?AC#e{1%##{G9b|O>}W~q|mL?%{cwl=yldj_DJ=B z#Zee><=Kb)XIy{7im5=IDy)1V?zmZ_v`zs-cq{+!;ewpnZuXbf0o60pbS3yIjfub6;?c2;(_xTKyJp>}IE{+Dh zX^GswUuTt>PoXT1w*2My4lf~b2h=UC$iKNg3<|gGAKP1`ww^%KZwm;R4VyazLR^nJ zE&#iOoZ(dHAPLBY=l|*ObY^lA4Ck)6C(lPm+patkFHm;By8?S~z|t{;Q<)B=6$D~G zO1BLfx^c6;l$T{s*NI0NzKzSx&e*xD{vo=vNXF<1D?#f+v65=JiXN|%VD9#Y3nAg% zo{Pb`oHX?=a$GWo`TEGHcG-v%mM z|ERhM1_e@+-eF=eiJaCR_|me@agUYPOBJrl)dK_ii);cxO3(BkW}jl}ZF_mNb6nZS2h0YKrF#--6QrMQnD z1COSAx%D!r>lDHOB`;-yp#G$mM7XB}0 zZ7zE|zMCjK&77FMKfu3{bt)8(XQH@@%`;Gh-3)~MlP(@06^tz`0RmcLDCDltW%t5j zLV{A{>jj?2_hn}-l$Zi9hCdA==M1zXKgVf}*`O}uoAzc)SrNZ4H9|}y4VqmkBH(^i z1n)FFW@b5ZA}Zna%%kn(BnF^08W%NB1dK*+HJkZyrG7T|xeW$UTB#+TOKAEnPy9oO=H@kKn#gh`^r0qx*i;IT$u69bHDxI;- zS1PHkwCU1}%sl8RKj)A+?t3?B-Q2AI04vIW7zxp`KL2#Bh%1?sZSl$2xjv~aAi>|R 
zNUEK2Qxe)oO1S+t*!^#*naHfu-E5pg+`MRS0OakBs(2m5dXe0>bR=Q_`|j)>1tt?XlOi3muZXCA2wJ zXIq!>17Hh7is^n0XfprQ#yuK)r$Wau)RiS>a7Vif<>x`q-dHRo^~~}pMapxc*C&^GUi z=$-#q8bMTC*1o=VOf?pu5;Dgd)S+pp>SMZg6GR<5?$gY;+%ocxcjtPmf2hAmPix!?OM!O`uIu_f+w?lPsHY%5Uv3{e*fz`*vC)IN)AJ+W8f9US+zQyyj$0U;eiglO`KBk722330J GWB&~oY(3`y literal 0 HcmV?d00001 diff --git a/tutorials/nlp/images/spellmapper_inference_pipeline.png b/tutorials/nlp/images/spellmapper_inference_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..07d85d2e2295c4b4005ab33d26fc10cd56936a39 GIT binary patch literal 146148 zcmd?Qg;!hMw>BDDqJE$;3PrBGZ#pg?i=;_mLnifeFpm*Vd3#hu_TH+_HSJ?A@j zeD^Q7Ym6i#J1c9iJ=Zhmn)0krMR`e7Btj$r0Dvm>Ra_YWc((}vyvcg^270BYeLfue z0b{Q$DF!GXBR+tBfi)4869oXOqLH8U;Go|Te|%N92LRBz|Gr@QZ3+zmfY&@JaZwc) z?c-HMftdqRgtJ9VX$-S`VAa7l>1<`WT2swjF&a8=NpVVZ?nN|*QggNDpL11obU1@k zOO;hXC6%1W{K$NJDvZ1cmDlxi6!wuuNIN8b{^lv;eiV|?31J_<^tisiH2FL2|J6uJ z@E-8~*A)Q32Mv}L=l^wWAND;E=6_%Fkq7+$x|@#Y%M$_4{|H6Ixv%Wu?f`4$@PMby z-J5c)fLpcj;p}`%lIP-&%WjUIIWmK;z$Kp~)2T^a5QpH*Q}Cqt6y1 z`DDl!>*(BzzBS5y1e*ceU`^#_JF_@(08fA4vfw`E|3xeS@JFU+pz+$KvwKr5ua~ky z2Y2f=SQzzpVo^mdtYalHNZb17iim5IFJBFZBw9iC9$-J{WB#Eo6b`@9_2~d2$Re_T zVGRKM#tXQt-o8jEt)*9g#Ss(*3B1iiK#7ezC5H!V`!n~p+^(qQ8R00M&=N>!j7MmVPmjKpjF)Up1J`aer_~ zSN#@Y05$$sVq8oj(Lw<_tjZr5mGe_iTPbzh!6P+Fqmkffo0WFSIR*R@4wGL&V+^Cd zY;Ok$0nKnBMSS?sJDmALZ!q!ZUr0|}!UblA$~>hqlqe8$%RQ%>9n!V6KZKRS0A^gC zBJ7`cS4UD`?ZRX#2S2H}YL)~-Wiem^v;Xja7lepH-)<`(vD-u6r~u-*kXucGJn+@{ zY9M=`7vS@bw78^V{Rz)~`)Rm`N^b9ncPS}G0cN6C^#3k_4@XYco%tsSm3~u&8*|3% zo`P<$>gWK|oPA{4&_T5-zo9!;FaEi>1b~Cld!QprND<%<0u*2Wv%pya{Y=(jWt@k}DBq&gD z-2ENK)K`07=O2UsL=I+7*LDm@PP2opG3wx zcT{=H*6;m*rF<_(q~A*w839=*E~iLtV@%q{*0m#D@1U4V5Gn8`L4oR&&wAqIfsXCP z+P7B((sfKMa9<&0okuQ;=GHKCO`zHCd4Y0H4y^(9@zZok0~^Tc)I)ICtlAVFpaJu* z_D$FyBMZFv0R=1vbO#CnrqmWEqgAh&JaIo=LPh8P&gzehvPWj3lHLbFU17=Ij3biE zyZQl~{^UXCkO-v91M;-Ej`$Pvj4Wef;~FkIx;OOt7^(!eN*p2Fo30{%1;;a z^s<#*onWOer6z`yf5+w<_}n*lE$KiU+JCrS-zWIxI;-u6O;BFIkBI;ug^2*{TGam* 
z=qDzf!LFB%g>TL)^GJrMfGok036(qrwQeM{o>a8IqmkrquYs>p@?}Yd@;hAohqBV+ zcYD@EzW|mHp{nywyMk@H{jf69M9rS{>fnf@0$pV8ZK9KlgyZ63bm*V}4mtG;Rh%VV z+9i(hZ8nuzHzV2%V3iS?>bbmGOCWT`0+p{J(+(;WSEQ#MCPqt>AhSm~vF`=6U=4r< z927bKG`Ba^pni^Pn+KWeJG~N^eGe>Z4?zjr9{U@aY8-OtUDS>4Ysj9~yCipT#NkL! zYlz{=8N(Nk3qv(;LIPhkd+16i=2%EZE@Fwyu(;S7m&pZc6tjdDqg>Nhk z?Q$G&4WAl?_XB&+l{C^WgCVpS#+0eoKkqmZOu} z_H{ZMgdNrhYt_$$#b}1K)14|wFdwbumUOCrwFTbWyNZM={WAwfBw-r+))*o$Ivr7GIB-Q30sR2|HC zyd<@uZ6lSW2Aj|mL@5rtZ~?D2>{(3djCihiB|RaWz31RCKrQCKn(kmjD(vN<9`Op| z{i+%S-4Gp67Jcs;D#LXUSLXiuJ|gpH=aYFgOkVc~oCi@^Ew8zjZQmlO?gSKg>d2b! zFj6qD>Rlie^9GXvG?3FElI~x>u~%#Bic4Mk%YldrlN5nu+4|Y}xq6+~8UZfzL;QU_T(B_h+>XBBccM9<{g+MqowzxswVT?2 zW155D3evcNI$Q(?FP@KF_Z1+3i002kqLAlTBgEQ&{GH}3=Ym&^7_M_e3Zr?{IO&Su zo(_!3p|G=}`o%a!8{pO|TO;bY|KpW0WMeqme2$6VD$rV3rO#7<#1mEUvFPlI*XiU9 zLZl9>ByIu`(2;(PyuQjwN#LMLstR36U0%rxn=WiZM*G@A1vV?;Umr^+fZsWpsO?$M z15KsXJ-XN}OpWr6=E&bL+rS0Dm(ohS7J#7v^erzbrMb^7XbFAj=~S-mR0g7!3ZVD1 zo|6)isU)%j$y&8gBl9;d4WdJrf}DuZ`wXirI;M2`PyB)BypG+c7;y`PpXW3!=r@?v zl(_9z!`ptjyrCTqUxt`}Rok#q_)1tnC^$Q227v{jBw1`Nrx zIGjil5eL8ggC#2z;~x{iob3;)pX7r}cdS`mL; zDj`r`#5u+IowyZn9QjKRLs_rqF9Lg)HWJ8GZv~}29^~hKsT=<=Bdu-^?4xxp+ZM^; z(!Bx|tIUMuo6|2@GT7_Ji2){s%@XQi3Zt63hq3SUxm|~1TPH0*rO|jlL8bA3QozA& zMk}6Vu=3JU@?Cmssc-n}Z;rZ^!w6mQHXFAIL?PH7>?miwRN-E?NEz7D>Uzox67;or zJ^ZX2+T=&a8xO-a4mI-p&2)^v`NX|_c#0`o>1Bt%e8IdFlV|h(F)Pz$*EE<%UNRN6 zkYT4fSI+JO^jvu|z>~Hh&(p?~K;G*;)d{V5-XD_DJV5(A!QQBLUo~&Begejouh~W~ z*4`m)iFl1VzqzR|~g?oQKTY_W7Z<{UtR^y+Be+FI*y$N^2 zbNMyD_=o#4YGb*T&MEu%JMCX%Jr-5{82Cv@wFLhH8*hWsmh@h4GR_mGe=z047S3es zXX}qwcL*YqNAqgR#X9TI?Y3h0uFno3%i_;2No_`v{)rn)n3BjEyNOY6;QhBr6GHga zatb=w_*Jy}RNjd89?E?~4P55&KPxga?`|Eq6sI`OWxXQk;8fJA&%D`&>1m7-Jx3}z z-QcO$$#ZDsVYYIkYdTSCo|Esi7Q+1S6;`X(HSRNH4KRrUY@!P`#J8@y5?9Iuqnt8} z47azj9C_n-gj&dbK}ooAYkL17@z4>t*aYm)B&&CVM&$Ab4+o^U+B+|5xVvM-4;xv?I?K1wB)gcX`_DAfD zw@DP0B~8tIz%DLd-yLo^`&_k-z>i&hcdAQam=KAJu!Ootx>kQEx-n{}0-x3WZ(!H{ z&QO+9Rcd4Cdbxun{1d%9JQ`sVQv2vT5$ccYZoL&*>%tSg@f5C_Zd6;hjgm*p5Af;I 
z0(eAN+dH`xAvX6|4~E$21w6{Wtw6abGNzSzk($4!kp7N+m9|oe z6@X@^6kpPOF!zfATNrXF7!s~A1c|!#l!~pOX_Xxj zkM5q3+t$%cXxx_(sMe;x%u&S5zYXrftF0ukjnw8%)*C(+hlm}mEEqhuf!@ER)b^{q ze%U@S`|2F|o=~z%Uc>c6MPjDjHEX@ZD}M-~0j+$4hJPFCIhIu}Z%ScKB~$6VPAj*J zA@$sXRI>hVVi2Q<=yYGx4OwSpTmv715#k_;JKwz^eMb`~xLL6C0av3zj4 zfcsvIN?N#=(y@6yPSx=|!#IHIaBFR3w}T?cV*LF#Zh8IIIUbcBK}OFg>BblPeI|cJ zg4@%B4919`OLDP4Sd!;-WvP6UyXQe419Pmt;t62$oSU)#y5v5~iQnA;V**Iv|H~ym z`$9uioo05pJ}`_bfR{=9YC@>K{3WRH{)HOJPHl$&gccfE$-;&u&+dcuAYF+5MokljD|1tjA z?Qm@RL!3l>vN7WMbYr^>a$s$evC307j>Rv!Alm*CfpHu_B7r7p`fVyJrW0L7+n)WEw4MvBCU7Ldjx4f}&gaR=g zm#=u7*X7eA*ILkB zC|Zll?aLP(Pd1LRi}MFLl@^+d-2KLMHUYoPdXe@5t;VxW(Re)T=r;wpcBvUwu)4PH|zTBta#HZ0=!2Y}cN|^XCzgbKP z5)kc?}ZTKOky2ja7j9gYS1B-qsA7b*|%`>ysS&RNn%(+g<`MO|m5LL+~Tp zspzJBUsIprS7=DtRI``*#H}(UamdYGDX{S#;NcIkjO4FAOuAx@a(v{RvU0OBAgr3q zAG5}qgbmCw#kw(RoDEpoqvX&MT%B0GBzi?MQ=ei?ph#adCmut=J~qS6 zs#k-S%*i3CZg-{4_I`35ZEO5Ii8>Ig>+nZ|;!iW88id8+k&ac>I=|2R6;fYk4FG^UeTqEo zReKg<`c}x~Xs;Uu9VqyY`!w1jvQL;d!rqY7hwqer4-rx&H#y~ssy-yuPO0yUCi=(- zOU@cmJm%q`rqa*NHepJkUjB*K@fW4Kw9Cq8FS5kdui<1eGFB0aCKgW=oU3m<#c%Pb zzN^D;QK(l}u@yW2JpXM2?wA+jm{j2%j1ku)JuJSr57?EB#j3-_2u$Ng56mQa_SCaZ zu}V~$b$lH2)XpKJk^;@KuD%~@Y_H%q!OTDihs=fr&|1{)cg*NZDl%=Q_T5vFv# zbHG{1KRe^>iV$3BI3Mrd{k|2jcKLPo=J9GL*!hLBr^uzd_`#OKXdL5Kq>mCeJVOe?t6 zPRuEm&FQG>I)X@HCHCHAz5Sp4lldP02isI;5x4u{QMFyiZvzmBG-MB|F7HBK*++%< zVb$}88pb18C#E4YQS%jnC+}kLAet3 z`xeg^=Yh+YQFku#$WO;Hk0@}TB7&I*#7KbD?*>OSj20S;>R1)zCt_(G3ffpB{Ho1W zW*s=pHEmK^tb-2&D$Id<*_e9ZarpL|K!%n9)6+Ii=uKiFq znoj*~yq|%5rggR?9iLRNQ;aLkmrH#6K%fS9U2tjM_gnC-*4RP#(_6#yR(Hbp)DwBS znniBZXCHvSaPV7@ebD|*I^L1Bx`un})u}OGAy@W_wb2$ETte{XRsx_YXT`1W9(E%8 zW~D3^gkCZZm8~I~TUo=(wdbgHO21_+`!F=0ZKdx{UKr|sa#0@Vz*c8*1XQF}HXFj9 zODuSU;5~yleKbY6DXe{<@awWn zo*U6u%!eFrix7DlUIo`GEUT;#A1pPjjvZBwquj63Ii2wf@w+BoI`J=W;aWP}Z!i-y zOkMs!vtg)0V#j^+2mR5mr^d(BD&+!kj1ZI3vnz?r#9RjxtB!wkv?RB|ND7`xF%7Bt+)>kmy89?K-MkZI#hwF-%cV^~MuH}#!kG`a0ekJGy4+WdM&~a|G$E)&7-HJ5>ws#^?8o5}|@h+mZNue(N>S5s|5B7|i&tyfrXok5u=T-YE5F8Vq 
z_7(R-yJ=Q3Xpvq!9I&J0NZ1Sz;?ycpI;~tPoz*ybr~N8qUSk^=6e;0P!RfNbABitF z?O>qz@Of@je0_({Bk_B_jS3+zpN6_}fHSUAms|sXlG|2iNs0p_x63ED{M4U6H#w8_ zI&g)ah2nUu+HJlv(R0UXp~k1zKAO}2aZz7bQ%3|BavEt&7*9HuJSeA{f_1X_4Y zKY$ibWel^L5^*v0n}yknbK59~{^`mu(9zF35)o-t8Zud;Ae`Wxsz;7?{)JzSz0DMePPz*^ldIxcU=aWx z{C_1Hv-S+I%8ItX)WWlHrhLvU?{ih(Bz4%Frp9=S`xXGTcAkK~*JLW%PcPGasWyGF ztKCY{E`l%T(I*JaoJ%YY7LS~X^Y$q{hRuxX(n$R2v`;rKYf9_ghJ!&uI4N_S0erS- zNJMA|)bB#Qp`m!1ATu~LKc{#!$XIefM zj>I=<6euGBRe#?ukP&;;e)RGI4iM!+uf?hC0il1dWGOK;tcBf z{#EY4$d3Bv*j^q}Zs&YwWfKk-U|MN~G&YZW_h!JY;kRE76N#-ZDv1+=$)fIe#`VEL zBDFv@=FlDF1h@xy>swdW7`n4`a6Tg}n} zhF(#)kx&vTHN5Xhh;~JRbQ83Hn*UQF7SuT}Q9gdHhL$}CA4=u+C}r&DZ6OYzH>mD{%5NA0`DV=?ZjN8(((+os1 z372A;p*A^j9+E8_{w5aMk^VW${HX;DXg2ijvtC5l^~Tc2$YRb6;#V2bmZ zZtQ{k{vyrCo~j9~F2z}6u)X-uN{2Y2_sN7PP)-0#yq?z5#@zT6*RVFGa%rq)?J+TK z)1W~pX`eL8xM47;LlocC;#ozNX^H*I9Ue!I(RPEi_Qu4Is96Qqro{6(o0K!0Xzk~q z*mO6$%V;f{og)vM-xq;f_u5RF4UzNVtp<+pqnLSkqo&jPuj+4I1Y?m3o+1yc(!=Yr zDRK5G{tcQkAe0zDRAL&o{KI86{XsI%r9YxlMQVe#>VK<)XW7ke{S_V0&*P5JV!Loz zjS@Z+%Yqc`Vv2MIHxt@E7aeV&nW+4sK|!+HY1CKUTvmKDL~6nGI>&V8KMFubfswlu z{jai4*NYtKG}Up=1oCM`77dfs`)8e5-5ICwbOv==IUz*1D&c2YAP>(9m<@?kE3qn4 z>ZL#-;OZg*e=I_qmSaKG^mVIZ0EuJ+CtuIyXqN8b&Y)&t%MEsO9JQC9$M*-K z10~aZ>+(Vzwdtbf(;LcQ&mok@%psMxf0Mn2o924C;=w1{HYQnOL}fX`++>MKZ8BLJ z$J)}`CD@0=L@m4A-#$ffY3?jd$ly5g<&|MN`X)xh-G53W*v*xn=W?KC%E($hDyvJN zVp<&jC`LCbN8PsPIT=Z~LlrSq9{T;^YVL#b9`_RrIOw(<(M2LM``kG(7$xsuzDKsr z?I`+}9massS3AwNb9iO)`DR8wd76y6%k8(L%>UQ7iN}F_w}kJ z^9k#fPGOd?xL7jt?BTkOm}^LD-rQYT2`Uy^a$%f;pqE?Rd>_s-18wACp)D=iaPi&*u&IT}?N`vWhtS7t+IY!X7S(J0_;Jnl2?IhXU(`y7QsRrm+QV z+&yDx=mG&bliUHpR#O;u76KN`qrVZDEv1cYMDAfoog&&WVAKJEIHeQfIEBo3EG=_!vBt6TQNYe~sj zTV$RCUamp;Q7&IG*1rWux}V%8_fiBWeaIPj^oox~8VMtDM`hp13C-Pe@m#jzQ5Ste zXa2m=EMV!fOhJAtHv=}^B5fL%ZgK+9S3n72!2;WlU+YCdMarA+~3Sv(d;AFFMOwX@v?KCZ36J-$K^8e-#6zc#12vh6anvQ`OdAF1?t zN1C^Ag%qXak};d_y)1Ou1s=#BeZRs!T>k@;^`+TH7WB~UG46Y|>y5~xRU8RuWZxedJ0BESG{Y%%=8p`s)X=TD638|Zqp2g^?+7hgYnAj^=;r3^&> 
zhHg>0>sb111c~gSD}MR1^C?gc#fXFNZsn4t$Zu%*XwR*hh6C&~!-C9lhC}M)cZAXs zXBn1bu}!J-{J=Wjrd!i{L_i$oH$O{c(QE`{%x1-jiX*TWAY(gK=o91$WYe(k6kw-U zV{e;WV5N~cpV6`V`K1o=k$>WAmXgBg#!}K*bfWR|rON17^Lq@}ClxlZ&*y9!Q>Gd2 z8e6W+0v~3MLHBYq#ZRM$w-zrFZw4h#^>92h$&eiNCPP4ba*H{$8yY9NC$(3y=nvSl?JEp-e_h=B**_?qUGg(6|ym?lyN>btw{ybFMj@I>EeP0t|TOISSF)lW5Bc+|o)@8I8+ZxZZ{?yR+gkcvD zjj9KFdiG$JxzMWi5a2aC1ZV^-c|$2n_yYNEukYmm6Lp@Htg!mUd~0fU za=%)JQRCMdJV}bpCk7W}q`XZc@Fgx+cllftx$jUc=aa%*m-Ld|b1IqD=}pb)BS4&! z?33&&W_p?YB%$_eqN9{C3^rlj_457 z-%2X154CidrLPjZxqIDys4YrKLUKTffog#c#ou^@5n%8i7Mjn8sLN{DNAKscQgL;2 zHs`}_M1_nhba=_^VrD#QZjh-&pq^14d(Io~h4J)K<95Mw)?zFE6$WpTI=${7wCM{e zMoGskj8c5+vg`%XR7;GE=wd)2G zIp)@%NqN&e4v1!19+my1IF^9N)v@l^tD|aev|%|s<_XCS{)-;GWWsXY|`P?Py>dASVoeF8%g-Mj#*YqraPV<16$Ao zI{si=m=*Fd>CBYOtL-D|33)yk?$yhPeHXacAAimhuWx*#h3fDy-wB-}l%iGO{g-h! z_aACc)@MPFE9?ZmxunT%r6RL<2~^GMchPlXQTE&57G(=Vl4fPvvrl4!{T~E8-J%Hj_(mJUTsny8)%!99Sr5)!(_R zEPsSDELv_}h0cj~bUap+aQ4x%I6(;hb1IIN@+tC;a$;C=1x?YmlSM=v(r}T!P$~SM zvS{-Fg*|(pW%+I!;+Y6U3z_6UwA@0_VI^Y>YJHj|qVvGYpNGPgbDXSMxaSD@Eb(8G z%q&ma1rL#e!!s(BGOCT(SkZ?HSwfi%QQa8$BGly3N67PsllK~!rzv+c z%`7|E;T*q)&?=lKhmIHJy9cC!c;uaA{}{X5lW^KO#yO6pbCEVBscA=MRCG2q`lgvQ z0~yxgXA|+PQvtMwu~C+LPO?4FA+#fsR0~BjsHXq1IVJq{?%8hqss9J17-FB zJpQ4Y3KBzE7iF+zMPH#S?m({9!I_ZPCaHd(y8a~ac!lv9q?MsFF}UK6ZY-DD3iS(A zW)mFjEa;?fPWAc_LSoNc>5`N*f7$#Q|8=KmQilBQCTlob>V-2*?D~n`gt4NabFb=# zumr2BEBbtV6@j{0a{;+@faN29PrS$HnGvb;5U6d7E5uS6yX)0+997hGjL<%GJ~k;~ zy}R9sX_4F=Ta~BKF4j(b(Z~~1nM?E(8FfK#mpyD|PyXQ04i{elsd5(lsPBoS`qJ-R zud$H zMk)}Ew1{}PVin_CqLMdZh(33%3Dhz+JP~>y^~_gu=5B@cnM7q7*83&hmhJ{KWnZK` zN;Swn71R6eMk9z+*war_kfgvFQql8x)rSNk74mv|C;7qS0Lo4!qn)(!8bkm%VEnUP z)M?3Hvlxr-k>`;isAzJRfWXLjygh zUU(X3uj|G7RQy4duxtn%wxf=8#fle~{6-jVuskh^ZVbT7sh~q;3UrZl>&)2tg zp1%E6e)n$#$)A%Ajj&dsjBAlQUXDp$%Q9Szl#$O^|7>1a1^!9JoOBgdsl4|6j7Ua9 z6%E#Z%|#oJfnYD$AzYw7DUeWq3g?tUh_^#maG8DykuU#MnG+XSwRzB)IA}BZq6^u} z7mp7xw~_^6Z#$aeEo#X_OMFSiG6$!uAhQ)NqHun3+qCI!L5c2A{ma?NqQ=`BxhB4s 
z>NY(|)P$)QR*>yVtnN+zXjD&h&F({vL9q$4K#-*U0(c?57)bOsKB%0F2&E3!Qt3LKW7n}f>C!u7x{*2@R(2L+)($K z9ryRH3yV`-6z_VNX%0OpofFFL$+~TntmY?@S#i9P@{ST+^^0_=lrlDp+8EwMGZxwl zzVT9*+|v7vP^Z_4qPuG>g?k*Gf*h2TU5Y!gN4!l2v4k`LAD5hV!aVrt+5D zgccE~h#*u>w!o*8rOuqw53Ct2g2>Ef3-G7QwIM6I0gkqp=8r!v)R`?kbC%2yCu^&U z%;%#Tj4!Vpxrl)+FBdIUE!<^jl7l%vQ42$hTq?EBRylM)DNnl&yzIu1huQ6o9K}+x z%V(53=W>L@A=X(^se|Cp@_K=J`Rk*p?CGV)LvOkb%GaA)Y)@Vt>I$;|Lxr7~hSJ0N zY=GX1bC>R-1e?jqcQxe{+8}(RY(vi=G1hmkNv0KFM`nvoVPz~IA>YolhwGbf;hK}F z>hrteK1o0JsSJ$Xp(4{jDdOaUxtz1pJ(1F8d{+2C!IlkH+VdkcS)(Q4d^!O-@8gbN zOr#pX^FP5cQzKhGnJWQOCwM@dAR^jDN@W^$r?s!ErJ=9&%E?t0yAg~HG@nazY`&zR ze=<2@aXw|^c6+3cBy6|xCn-J@DqkBAbbAs|JzLu6FeoNeU&L{cmhENStE|>{vRRCB ziv*{@_8=t9ajkdS)}W7YulaM=PMw7a*U8s-z1rZQWUAvzo?%OlLk?|ACJh;jp!g6S z=ROID3@OZlLYnIcvl0I9;&7@^P?z*(i%S>>{M~M{zy^`*{w%Rnm1S`?qb2t6S7p;|OG@ulyfT3#pXTcpMjMD=7)Qu%uk_Bvsm<5^Dk zr*7GYPUPiQW+Culighn>4}?l#@vy;nppYhhK2)+{rR#QZR#ZhBjY;+M_9!GiNzV?6UniK*i zYfu~qmDif+J+1}R4>k+8_@;UEJ5^5?Vkifc?GhsA8(6k0Ak*s#DW#2bB zg%u|jgj9iZuU9+=pYs&u`zfFt`Q{Ln3T=r%B^He;w4E?#AS8tF1*x_G$m$(SkZ=-v zze)-2YGrcwfA;=y`$RD5=mAeswmZ<>i2w**gGL1thVOxOb0S-eFvG!$G}+nbfraJ~ z&tsnKmBQO^_0Kg${;{~}JC2sW(Hm&vKM{GY^1SsdICt8w*jNw_`XsFBC|v{5C(9=k zC=53)kJK>C60|?eoj2ME7L#vxeMFn}8tlV&OCPw~8{zu==|KJi`A-J?u@r{tW$r_k z_-P8{MV$b`}nVy@Xw1{ucNO+2i(4>iB;PspurO^&(xY-k8At z^Wdg1Aw?QCAqP&v$2OHc{;ZruGw}&tsnkI06aGYvQ1gE}@vW3iv}NDDQ;}@gzoE9E z1A@KVB(g{DTnii#IHLceYkuLi<(hkkB$1i;u}=RLdX;1pERuJR;RV@G1O~P#v!YNWGP1b7)>iKrhQhVilGEK;p_k@p zvb^v&%Tlvkl&OMBr+qAkS9NKl@5oI+ta+DkVrgPC1{vmhcB^$WLa`8VB88Cksx3y3 z5l&7-1fPZe=c0_-YPt&K!Pb6D6>~nsz)<+Yf$cvKw+TIH7$cG1TTun#%D*y;8yK?Q za9m6+Mkj?ILfI#mKy!fq=c(J@v2cNwX`V*E4xaDs&UB@(=h)DxX+Mab^9zHMrru+H z(t)LC9B5CmV@!>4kM1o0{jv|1nrzS1s~U4E51WwZcF`7=3P*|-%zULN%LyZ6D&gKd zI$$?unEEC6jY!Bt$7l2%2hr&U;>q=7vZ)x2`l}}f(=|NV5SoqvdpVLE{^>T~ruUpm z4Mbc5o^-&)BazI=tWqztEPtYEu4GX?CT5Sk=JI?L9HLMV`MTzj41T$LU>BBi0P=-J zpX=}5X~yflwhZbakWG_GpQ{x`ByH;+Z5N7K?4!qp1=~(i%NMNkO11}Z`IWbgbbq&R ze!J@q{ILB3_xB)9Gw9{n?= 
zp3Z!UOT3qxnhRN#9{%}(bQ~@LQ_VCUAA3HBb!ucU35V!0q0rDfr@5}EpuiQ{X9bV# zhgYiw%aDq;^00oXy>9ZU!6O)c2XZQfv?6Xb6Wt|q;gr|FK*rV`vG>HO#j($~!sq>L zNr`kNwAifIdP7tW2JD9Xh9zIy?ap@|6S0T(VY=%$ zcX`HCNf~MFEQ|LD9`pNzzhE^|Maf;i*venLghx!^O>kou?!@=CElq#>x|BI#0bZ6&!a=kzLwZ6 zt_+H*wy~}mzcrvf+L<}9ku9vzGoeh5K{YB4>*ci2 z<6&m$jgWjjOTzIWBGEG_dKcRd!5 zn-vs~l>EZ9f{f(cN76$4Sl+@LkbKIyuJwbmClB3qsds`zo^Rayv!my}-0VtBn^|#j z&6cc?toIvUYK$Y1=VLTkAGJ0^E~f$v>bY7I89Btk?-OteNEJ|`X&4x>iK=#6u7%}6 z#h0Ibh6QA!rFS*8fc^Phzlfjb?*sZ3@Do7Ttc$!C(kbgBNy17u<34A4^1)?y!ihTK zUZfsD#1cBN>N*8y30=)W?oI~Jx#I0GEl}qS=hjiw&Uz`b>o}tSnBggszMR)EC%GJ4 zR{fJ*vj%B1>$=04UVw&E@Q~3=HD>ZA`!&Tm7v@9F@_o9+B~zM;;T1_$E1BqPPJRB zAG5oSSXiGr&~}@tqd0?48QAU4l}$H6mS>az#fW6=&wWOA0OY0i|Kk9uesR4Wy~sSv z6vg%7Qc4G`Hg&V?045znJL^*oM-^u;EQ8XA9T>9>T_Y;t@d(6+d>cB-IRa6cP~-PVsh zxP!@z7|NSv!N`Epbk8LNqm>b>x%*CpTn04tqlma(?Qftb7Dio#GjJe`o_HT{9qh1P zH;H{zS2EH!yAoUC=0<%!?=u{Bd+$JwH@@KV1~ z1{v}I|7hLS+rNp#*mBNaj*?^+T|ji_N--XOnEUX!4T2_>zIWBKnFy%XGVZ%+DiPv2qS0ukRQ zu*`zguQW}+*2f;8Ur8Cj|FPI?cXi|3(ti*3D37dJT4l_SWZKpn2~m{B(l3scQ9`gn z&gD+kU{WOzN@@TW7$}f>R@nX?6mtjgY7+g9)~5L^Hhh6lCIvPMg*dBrc#jOBgg4!3 zn$?9|oj$M1h-Ta-aH|uHl(zXk?TU`&X(bMo9lvs`ozb~v*qWMiogh>lciE0<+e^rF zw6<=X8G+o_fXEJdwT-0c+lDMZ`J}P(vM`TBL>EBBdt35w`B4^pn z_Y1s~hWpg<3DabnPwKe(s0OivA&K9Nr5%XFOe(H($M~!HBKI7B%^gZV1KD2SUbAmn zBRgJEnMLJDqf)HNMW2d8mR#GvdTT~(QRs0GY2MuXv{^L}XW^eza?RlTb|e&+d@!NT z6h3pu9kngcoXxJX!T`>?ICUE+Z_ba(lCyC7ioY%wY0mFXrQv;JRw4fMMNcxgJW_eZ zR@<34rK){C^af!nf-t&>?&4+1+}3}yS~s#6mXgj^^ap+B_FC9VvRkB*++ywK6S|nY zTlZzcYYM(UH2!nhoh`GfnSN{>nMnp0vf5?EqqMt%O1s!QBzmjzEkildD2|s2dqf|N zOM17!qOrRj`{wkw{T0D#%v0X{p7ETQin`NN>G_3B^5024B&6p(PiCaLL#*7dDO_Kh z2C9?Y8MMpQuU5{Z+BR71RF#%E^NSho81tC)KTEYw6q9vzoj3}&pu%7KZqMjgqgsG!(dq_eL=Ycej;%brhh zw)`A)UVSQ@6_N6VNVV_&*TG_#{i}*HS@raX?<_Hdrug3q;0Zc}9xP|v!UN;tO~s+H zOFT%U(b&6Dwf%9jRzD}7>1IdJ7AMsiqEitGjazbO1`dz~4Fq0~KL$IPpT3t?P&xWiA->MR23GkXnJJToKC@pVA@d#850jWrt0 zbX!A-K#W8}PP0koN3Q1?-J|ARy;(1{JA0void?sGL_Mqj3j47G@8 
z9;{v=I1j1O_=V6@iW2(2&usSi#LJY%!xw65zBF-qiFi%Wj`Z>M;HahLqE6anchvHq=WG&M@-*=#vMH+(OmE%AHl{ zhwywX==5bhwjH|blr}67B%7Iqr}MYmYNt~D3Js#xY(~c`E-U{Zvd%KB&2HV+Z7FSW zC{P?)+}*XfJHg%EUE5OJ-Mv_FcPMVf-GaNjh2*5)THo3GI)@+mM*=hRo$@?m+@pt$ z7TGSZvOBkl!mRJQ;)-_+s@)%R_dG8r2K#Lh=bwAXF;Yl=aiY1diNdOl&^G(y^wMSE z)!$FoP^uR^FlF@Wg{D?3`cF5^3P4^oBpD~EVAJy<_sdhp2~qz?4>1I2_mXeYg8RE1 zT>ri-+6t>gzm&?NxI3jt3qMe}}v)zH_rPFdV@=FU+@K zHf4eBkLQxX>0q|&kf{b z0$PJzG34K)z&R{vHOCIl6o$_e! z*7#3RvolCpPb+4)jz8y&3(s(vgZF-^*4f3YywCv1IIFmTdP85AQ)%yzAip=H^A(Mu zzljuPf(RW;o%XUrBq+hSSuwO?+#Y*M%YAw*Hs%*xd4XBBn#W=F+Aro zGD5b5${SM2+HItUS?tyEaS9T|R|T_f>c+~wnFs~^oNqUu+o!~qCq59#7EVV(#ZcX+ z$|pja7J4$*m(Mi0HQxm|cP4r+yDRlc=vXU`t)>L%-K#fOx9^znZMCBVM@)0^9 z#8?)`QblWUoz10YD~vN+;mczg9Frq>dV?e_-=#o6zu#;;)rVkEK-HSD(1#Y<9k{%M z_W7TJ!lWGxZ&G+8jDqqL*J#H~v&II-rU(yTcysE{oZuh+i7*vd@n_M|sN?GhsT@FC z#JI3?+;wV&vC7Ug%a2rH(PL)}*-h*P1zV$d_*2l8fw$|0Sb-OcYZ$wIWx~Uw>cSD> z=a%no=ZBa03}LY@hpyAyz31Y@DobAPc*SvM@ClzH@+~^=OIoUpN{;ZYT9yyoei`ia zhA&_P+|GU|c<`^ZelJBoTiY=J?$fJ$Y*_beXm|*@wGTR4RSgKJp2F{Kdr?kF9HAfN zY-RG|qI+}*(ppEANVr9QP?vpif-M-~>y}9J7D)!-{P!n*US@7tGr#QHi$TBD<}}!F zb>-dMrc%;ZceEmq7$J`z>`{2xyZ;B|=jgs>RI(h|zFAa?-IW<( zqOP7eCvckJzevx`4A7XqXy|lIZ=9?|WI-r|ody9GP^UoS$EZ*R97$Z>#=(iDqOwkR zh(yxvFX?h5RLWd$W>mv7kyEM>g>;Fr=a1#MD+*wIG`{ysF-QY3nBCw5 zwy2I?Z6A*#XJICN{7y+b@gzQy7;j47SNwCFyypq+#=MWJU3!w7bfXbw=sbYCqz}%@ zvSanfuH{rLu_HEI>O*H2aH0SuiIv`A6VCcOXLEgCt$YO)+Y*Pbo7M&j%Ea^Flt9W} zL9fPh6m!#6LY~My!wy3c58}HRTY32Z2pxW>yfX&ci(0pCI|S(5^E3_0dn~;P6Gy@- z*TAn_Kf_FmUGrbhOir>B@jyazsV%!1;Ns4YYmNtgd>+YCx3?6BWDPEc zbs@0YpG}>9|A=k9-DPk1ieOkeJpx#JCFY?Jghx#`=h&)L03|Q(6CLgQ=*K`<@#_;q zY_;Z7Rzk+{CT6jYPoFh5fBW^-@g~YkKd-IpN~CGGadxFTF&QgwPoxrtz?FhaJB?|F zhB9t+Ob6+Yz6DHV;Q8*t@ONz)nPK*kv0M-HpS?^Ok`5r|9-D~%s%QW34o^tljVxD6Er{17q!9-oFPd1;_sblC+)kb{ zkH?0l1-W*;E1eihe|vE2Eub{VEON}35f5mxa-)kP42y1&8nG@a!6HeZf4YZLIFdGQ zX7F9D(aL|qjrkCPF-2SsmLc`#>36q&pEby&{Fu?Sa{tc4eBWQdyLoN;aYSaVZByF{ z4^mdc|6h5v0WOSuB#zB@8ktitBjbj|@7P1)$yzggdCzD04#P4b(tjDUGIoHpdX1wQ 
z*VTXGsLdLG+GJih(V%d$Q+#9xZ}v?M&LhOh*pE6Xv9;oN`r+wHQ{wliN~z50Bc{1+ z`E=(k&U{s}8>3SkIitFjv!WlkcJaIESpnkN{p0jpPdf%7?slAnk2fAE=cooY312jT zHBfx}Cerz)3Vio=+h5{8HKC1oQrB?q>1V8{QH$-uJ0_AlEHVhiNi66jEK4jAa_Zs^ zYuBbJ>zFxNQ}*L$GgcE!XEmW?6&?&~8fq%Po;c&hZax(_(vVDwA$3%FD#j(-UhEj_ zFa~F(jH~H7TCJmUiLUf>IOIFn!k&?PzBR-76=c97l^j_GIYnV-IRUYncrS_iTWH;* zZ_UDEfWQwjJHoMdb!hP1r?3m65L2v%Vmz8;g~5r%m-E14%S97l&dk<^-~O_7Xle4} z#g~}rt^l9Ckc#PFW5jQ@j31$h0HMDd@ zTVt6LEDv}J?sTYLMC(?p7H}VL-<6X|$t_gy82H@vBjy}BmB)CHYR$S%&5+C9IvH!| z*rPj})78S=XNj4OYN--{1yL+(V8AaypFoG*y%++vk%4I`r&;D3v8LPBOiOu+^%;MO zfA+3#!LZKbCakoP#;8LCcAq|~cve;1_>LJTCnk2j(jr{!<)KVbXt`Z~nvbf}e&zA} zs9=H8MhY?rzbeX)`v81`6OW&k>X}W2ygaMwbG7Ev%9QEI4UNoDTO-}WX3W#2>htXv zN=9+jX{|0MN4L_P!zs=O^LE@j5_eR5B`uJ;z&LWz>?x+=q|eqnvt#MeVHLFlo6EkE z?hu_oh8Jz|^79R&qwz9<6Hw-Z0T`~^BR4gXyl~5p%p*cbU6q_Vid-)0kZvwTMSn?Z zJ}4_bSi8A_$-Re|;MPwo5LvGmO%iYc67=|SFImB*Y$MW?Do+~i*E4&{Ru^sTP8A86 za+8vpPNZ56uH+2Bu7fP}C=eD!WMSFyJ?l2E&T@P&Q*6KJQDr}IvRms^suRcL^Z1o7 z=%+^Gug((E@t!YCW*`66$2Q!g}@_JlOB$mMrDPlha*D(tD zy(BSleM=+iD^Dp=YEzq+okeH$jO3GmEt&6Wl#L>vx+1Bm@$*$S&iAba2U`dg2%5g; zE2+t?;S|Ike%roUD)+};QaKq^>gKjBZ6q_?K$-af7{ zJyVGHJ!xv~_nMq>(hcO(MoQvgZQmzErf=)@Y|Z*U$xc=@=E9Bgv@Ne5e%R)L(l%$( zNuDrh495E8Zf=^59p@{R20uX_-&uMI~{2yrlqz1&(nvGSleBcMWt_WEIuPo!tW@lfS-ey-%ZoUY zZ2advO2ev=kmpaC3bG>LFW;2gdZfrf!Hs7gSNnnfF8Fl2C?!0qf_u!ZwX z7x6brosz2Bj|^G2_1%!LMKnkOz4}q25hAcQ#@>~&=7DN)MnW`<60ou%EvIX}@uBe2 z?iQf6B+W}!J(2d#_+&bN{EvBSB;<)xwamS^r~yE~CI@K3jsz6dLJE8ddCk-ht;OKDlX zGCn|Gth3s-U!Ks_{C=0Nyz=Ox)wR8|+jk#-=UV1$9@`sF29uOAgg>mtjYa9)D7S)A zEk3^qb$UJ`xtLVs0ks878*(6s5lunbw6SLDV0Rak{yes*u^OJRH8r;NY$*sxiUZ+; zZt>FB44>^n^+g93k#Z`7I$lnm(V%20no(S~#4yim# zEDcOMF2+{!UYNRIrL;qC4l(cG@;%N$f1NU~4vI#uMfd|Ujy=WR`;?iSN zZYe_z%K4a}Dk2V*tQp-W34bDnJl`Puq6cw)=5!iv@*`y>4t<(;=wG9gq*HE5+Mzs# z{%*OE3<3WNNOQ83nU#-IR$A{2R4NT#lw)WW8%T1U;(O*lzFKm6x8Bs6G-Ju}f(@mB ze&NO65p4gcEMO5QQ(Rv?q$thU(8P=%M0sr89A)4V$zC@f#vdSU+6vxLu_Vg^a%Lo& zF37>jY>P1PyNLUf%HCpkEP485xhDlYsvjcSiMkrpX9w#TH}bJWJ*v_aL$NQXV)R5% 
z{AKgk!p1b&0pQ77oBV*qTFgrG#iJv-{9(ojTX_n$+=|xpQ*Bps1(xX?fS#To_s;A} z__8q6in7y$Ju0w<(3bWmVjq7#=7!s;4nXQY8Y4B$K4YJ2S znlqowGN#p+_c6YA?}WU;x_;9S*vY}O+0%jz57^m5J-`2|alLuK{+ z%4AjqUk2E2zScAYz(t{@+pe+(1cC*+dnA>C{?7i#=ZOY{Zp)wTWR+_Ldzmn8CE_gW9IA}v0R_ncVQ`yrOH&j3ufuA2n%kXhw_&mR)F#qxNJWoP zi1EF6cuI|T2g_WB!?!1Glf%i>o%bSSH6uR4Qy$3lH++W?e`?ujDW1b7L3gy3glk(% zhJ1g_4~2ReO9CF|n+HYWqr=N{nk2fC?Y1+vSVgH0RUsx}p^UV=3{Is_Ql;Ue9e0TG zh`>LBehstk%g7)i17X;_rb3eqg6eLlH$nD;2;DyKy4OseK4kD_D3G6$6jYt{yxBFbX1SDo?Kq!49W=l>rVhgc;Vkt{M)vbH=+4 zmvje*E((ts^B&79;!-1#YLtSUSnoJ*q=C%q_Sifm93g=F;&Um{Dz-G9%hy6}^o0|o z`#pb!>;sL*jSoQs#n^qzv<23$5hLC(W-Xc7lncM3U6e((bAM#qlMVd!3p)$tn&%#(OEauIiFwCaBX}kXV_Zs zy2L|;5_p;J-~Hx`@$!%b>I)4k${@Sw)w!P3ilWs2ux52aS$XN2R0wE-DCr7Yja4|? zcCl{zqIj&b@zpQXMzzY2RCck=A9OVwH@!7qI6-X`g*7aV)T61M91y|MYO)RHM?%Ag zKiX`+&B&w-)xY69Hw6F;Ll_yIpFI@blN_GA5=zcDY6FskW~Q+>ed&7CYG$JC_0#zF z|2#EHJuo8P zS<0_+DMCA4`))3<&FEqn>~K=0XwtH)V^N`Q3j?xTB#Bqr#%f(m0a`cvh3F9mib7M4 z`pP~LrEb7;9&oD}N~0`zG1jY89g=6IKgzx0jQv#;Bo=5s!g`IhVf*Bi3It`Ab4uiNfW^j`La6~I6PMj0)yW|~{^BqFs~heT`brrpnowGuin`(_U8t1f~G_pc)Y5GjaR8`ktp&jME4 z#XEqUU-h9OrI`#EA(;A!`PVvn-Cj;qn>l`!{-0C>CtQtjoCxP}4T)19ZqgUpVhWNSTXmZ`%WIkc$KAO10qRNIZ7IfD8G zS-~iW?aHrLQtLs<^_dnE*+Qci89JEE2wy7N7+yFzSL!m=*u@e)BLIWLPCs?ASqv4% zxk%(7fEtF)O7nti>ksHtc}MT|Hy-3bvLb$V@LQNx0##)wS#nU=;ffzV$(hcZM%u4`D^3F@(Ih+~V-k#nF|&#;%Wy45C)Qo(yP= zxKRsfq(6vL=hWOEqobo>l9uWzUEdt_e+h1?5X!-6j$leaH(}82vCmUXPegwEt zrlaW5=LXsDtzIt?fPpu+e@Zd!=uvzYEodAJfeKK+zfb9_SFZI{f&q}o@Z5q#bE@nQ z(QQvuZEr~AWpw9l^T$PcTdvt9P}?!-YZ$(#O0L@2%qrq@WGy6RUu$~#DEIL$2WZsJ zs8p^Q@f)pIY<0Nj#VwI*_ZnO=b|yI9*xrQk6+$;vw`Q!!C~d|#(o0L(qA4!`(?6;m z_{TA?KiuE;bU>HG2_H+-vtWuiK~_GYiZ|=chxM218B8v-BnC`oMv|2VR)Y)S#;uPK zR~w32-OP5ght2fymLtOV+5cA{6%uf4pd0kBMv5C6S!h+M;NRW;>}gns{ZW@r`wvO8!Sh-H31?APk$vMA(--on_ z;7fHdjkNgQoYUbLbOwYmxcqjsi{go{o0F7Pu^({Ij1~Sv%1Jdfm^x&>4#qn=os{au zufhJLbv}z5y>eHTJWc(o28AeHc0?%lei*3~Yo*7Ck}I6%dzAJXgYUrHmQ(el_CXK6glo9j*j@c!J4LifzE>drSc zex8Vk+ZEsiBViFddR+pUS-?4L#YnP9mo!de-^nNyCUfqi22L>8yZH(W5e9ATLes=B 
zn>bJ);>_4()Cw_A$KaNpkUkD-L8A>%0k_kB%v-?wS`(2h!r>xyQNnX>Jk(y`MD~m>?2ADdnmDH3 zFbqOJpk8U*;S=%z{R#Arh3gVuA7MCPnwyvzQ!u&WDrqdp3{~KEqBM&8K28c}2O}7_ zZDx8Xxk&fP3JnN(R2>>#j5a)P)PUc|+udM5&TQO@l^t2r%bMW5`E*Y>d<|<6%ePuuobMhT*zBQqJl^(S`k_s$F$5;K% z^Z`%x#l~TgX!*VCQtHuCM2fpjNAUEH9~WN%Fnep^_fR`H#2LEWlY?)!ykNspK;?}N-B9K6NXwXC8q2fEr=3)Ro$agl z2Gz`lbu6?=X!5CBCG@Z6a9BA_WD^TLkC=Fd@*~{^-R5+vqCO#hy>_#dyFDB$D9@Bl zAu;FoA0%((Ib83`Ne0Frg7Ik8I&+eA_b%;dzO+MT0OrT12Kx7W(EuK4`!6ssa;M7E zjgGy^7IOuNt58-8uXG7vA;KZqpZwVqW`H+7LH9bvvvz5~db;nX%^vU`IL84Y2q`i> z68YfREI2erGx6`MD6m{(-(&E^dQ^c%Sf_zASW)$}i6KaKM$ms)(5k-Z^iBxN(R!5H zi^f9jMnx|!v@cczNWNtK6y$1>w3>fG(112R2a+{8?yQuH1ENk+TL!hImS7k9hUFqI zs#n_oCLUPIxp9cQ?JP`&0FGk?Ah+$PS^k3PaVb#84*-x&foicS%(q8;{xsGOV`j%F z6B?OVEt0{y@=J#~?fkxch`tE4qd6FQnplFgRa~fW_gbidamoFCk~TJb?H_z&hPmbE{v&+cAm;~+^z=_;PqG@BAX%$%NmDC930l29RxjFp`Q7@G_`G! zToVN2CCoYD*GI0Vx0_a1kc*awo*c@q4cHqflB5+D)<|;^`SuXqu@QHO&iR61CTke& z`F^d;Be4lSrKwe%SFA2REHb=gurc0bd23P^V``MS@qE{i%A z%0@KpzwqR9ZLb{$gcAr-*6deuwO2~wam}n8q`sci{Vxq<*7Kv_oJE}PsY#mKktose z1_S0sqtt0}WpP^$fqVj;!w3bn9%s!+6;b=XhdHwp}@3sVJ3^ z8Etxa*X0Y~)pA>HB*`kday~$K)rmaay26?x-~%~vZ!Mq&z9H&y?F7MGW;HzyR9nPY z0x*be@YYA?3y}wf4-Z@a18*m`WJjKBz#p?*A6reqJ~?I?PoPjHdbOtErO-nJODk|$ zG3?Ndpag}a#sem6Yogb<_1j|lG=pA?0c?`^d%I-2XB2&s^9h+=FwPZK=NDFv+E}|Q z;mKwNJ6uZ>yWA4zMKEE}0eLS&ePvA#Qggc?N&mN7AxX^P{1e6eU2|2 zZ1m3 z-dpQ7+jS@|Ewf)aU{@-#gFL}NXhoW4Hi9n8JFv$c>ds$zA`cHdg@nsqD1`198UA&d zMU{^DrALOp1#pm2sFi;aG#`(Cr8CN6S(~&Ie5oz82|7iSY1MhckZ%}>=!w)Q2|X@8 zF#IYCW35<9Q1Bq@{0FepfvN4dyu);b*1gky>Z zl@1qP2$`+IqmJNP^fb(If932qCC2B81>4TjA}Fb`UMS{u+z6gQi%7wPjqS+n!miy- zS^C97KjVK&B{VNX&gwrgf=v33)dDkM_DT+jKa`o)1S@jS`VXKW49N~|`X`6FURV}E zw*)3ETOk=lktbhU-B8u>DT0!@7cK_BEZ5mTcYAlHByuwVvDBrRK1-uTPT1_%cu}-k z#2POgZ^SDiG+0o{WG_nWZ-1lU_x#$BZ;|0QNvVG{jB78xNqTtYA~n%H!U#m=VEL}j z_%#n(AT&JNe$D0Y7_TkOG=i&ZkN{;kq|gp>e9pVJEg18c?V}gc5qV#1 zm{u*)SV9(7(n=Hw>0y0TT8TYI5LH4dg=Mh!B0{CiVkV|Q!cdifb8Y^8+mQl=zTn*N z5jOI$eyE=mV+KYoX#6X@L34O(+A-*-Pn4ftZEIxO$IVk~OZKQ$^3wky-k%rGJ3W7{ 
zdEIJsA39u{zmuMXs-B52D|jtEv#iin+u(QQwgNGWQrMk2^3pj(f5CN ztp0DbJ*8q4AhHD1b0IHqU=_oe)GqS>Qtj*KDH(D9#o7m*noq)@@)-}}yLkO29eEQm z&)ig-JHjX0tYsZNmt-d9U4kE0;Eh6R==+QJdNNvf4DCC-QU~P^X?KuiTuE*izn+O} zUN?^L1$hSZPqjB}Q$0BnCY*DuT39IyGWtH(;n=8cNExmh)i!>j#M?nTQOIZMquyU|@${_v z{az-(;`2DIA&T>&AW<@_YxjFWP5N-ejv@4x;@5kG)g+n#(4tQE`m0i20uU|?z`*-& zwoC3>oB#xcn7ulqi0|=FKGAbebf2A22pO@T$<>n1%(wpwIFNDBrWiBDQv zE>i)I(?rV8F#;n#HsV_Y4|$*8QZXZX-g8FOGD^N}NuM4pT@2d!ATQdC{b1*cS{$jP zjJ^wGo%%h0Z%P^A8J>iVvuDNYRw?_2w+NR7&&PEVGL@pm%ml-a|5@yEbQa?a0n?LSmt5F*>6aJMlT z8e2-Tp8J5K8?*N=ZBiboF$HTDw0Jy^VtRW*_Q9AItzs;f-HT#hV)zTIzxA&Z;Fh*h zzso`bInn)*`(WTul)vR)n->~AN}(v98Edv>agj^ZS(e(?P8yY1V}t7jW{jx@FZN82j}x1*ypp)aVR)$ zCbc}fVb5ADw|E;bd@Q&_9E?HU7OCcHL^>)GzIFeuqpsw#S);<5dW$NHM->*9+;3hx z^0smZ(Vf+yOw+8euU{_H6104Yd1@k_BVE}PSwr9hhu=yPy|IErViAaE{>&ECB|PHX za{?GpVC!42Yma!}J3SU?7mv9* z6cAp@vdvl!vmklxKHX0VwaY4)q#h!c8Cjk^SMnP~pk5aDz<` zQ`28TOsOZ|N7R(%KplHb< zslYMp{_>dyzIX{xk@{r46D&yK?#iuBucTB;O*#A{zeV~}ShHRiq{WWzugpZfDDm4&a);5JcyUfb zTWs||PsvSj<3bsTq9o zDf_$`H4&bFaw?XiH2DoAdW#cs!T~BEtD1ffnDf?YpJTeT_wl=NEPI>0iq<*%+WQ~M z5A2BWwT(mn^f&Q=O9F^lXx&rqS6uN%N)h5JvNM@&Z$gK=p z$@jYlsq~2AGc*TBT`-vYNdt_sju+r*6+cl_-sB!`Xf)B*zoQ>@xwdoEZe?KVZ5<1r zrndVaQ9Nh21^b%pwgVYH26oZzoA2l|+18BP_`aApnCr#VlHrVn-7V`CugO*XJg`I! 
zl34#%!9KT+hJI*mc#UXEAx$g+!1^Pzcc@}v)w6Pgu_Y$k$Hbn$8E=O< zOT@@U>@>qtcfkMN@hVGj?etF_LGhiJ@gNG{1`MY9BdfZ1V3Qr{lDp?>s4+AbFmZZa z%Q~Gm<8~vTRgIbD_{M-fDE_bmQ)slOuip>{7qVI&h*Wc$N3gVkqwn2Z2cov3F=iBt7s zqOQ-LU$FW;GlDtl`|#5`g^01{7|1Co5EP@uM{WdD<{y>qS77-#e7S0E3B;9V9o!eO zr!*FT(Jv*aUZcFpx{)LDV-s7>OtDvsPl0aK{vCU0b5$bB`fkMhPsW5s+e^x|gi2;+ORsbkkA z#+K>LDS}$a6Q2hdfk_0rqI+CwE2-4qH8%gljgPO|6+7plsvUbj=DOZrBL9X- zqx#}%mMj8;>nzmXa{9GQ{P-zVZP&{ z9~V^UQlc{F8jj0U1PC0&YI28O9Joh_Zrbk=E{%BYe@9*wo4D~dZr`}o;%Dqa<7%Cx z$IwgWeMiw(Qt`Mn!%e>t4TB94;$kcD-V6j{7-{V3!e1Hwnc~tDax_pHMJ-1`&gP7d zXs;~|o80FX58foJH{lk{8@=KO`o7Rxe*5*dGnuqu%P`MELT#bk=eoV!9At~i0 z{sLiIZ@2PgkzxwVQ$1i@(a2`~h+9p&Eq!fF+hSA;W(vr@Kfs_xENQ)EevOSFb5)>( z@QwK2{I$t5{d@)J+jA~gJF5kIcoM!CY)F%9cR0nsKPx)Tab|g{A_1_q;FiUfqUxz@cQ%ZQWs;9#e^p)NC-h&8w)A9Z96S)OmT195FKMMu z?R)*G!pinD$NqH&%7Y&q4nt@R5!|N+xKl*2Tvs*5F+m)e`Gwc`wTTJ7vSLWMA-OLw zTE2|}bGm(NEPag(e!JiLxOEIzg@A0fg;YpdzUeCo8H^)uhKJm4#WgT6^5NETNKwS4 zpbKaDg>qCnn*4{{|3{Z^LY#(W1Qac~<6)blqnzVoB-~i2Z}&z@Pjdq$8{l?l6aNs3Ahyr+RR zeQPm+Lvb%2TBH4k93<*6a(ZN5oLxB*24VA>MdB$giR`;w#f#?LtX%mKsE!72(2EHBxAk9(C8?z#14+tU!07yH$xds|;x;Hf(7 z(s<7DRTaoN*p4mt>159`?BuhIQ>~y(^|bSGy^qK{RIED`%%f_A5^^D>#hzywMAU~E ze#T~D5Wf10Kaf=!UBo#*8B@!6=hKHSBqrZNEPU(+mga76;ZMBqW#~~DW?~wc#Qn-o zD&7SuszxEv^tfYbL+ktzF~1CF>Ia;{&ti+WGwy9A@I@EN?(wWs20W?3u?ktA$=PhM z!w#S9#d71D77lcLzZE9$$+E&7H`!SGAo-!%V_Du(6mw<>c`+}aT77X0rBuiYk!j{u zw*#l-;J~6$I20scS!v0@!k0H{eQVU{Q+QjED0MkX>JLax97c}_6`I>)bq!9V-W`6(C~8K2D;Yp`nyA8^A++vWLe z>hBh#nI#6gPMXdI^R;co+|+gTGL~Xq7&D!qUa|#~YY)=AR15H}{2(fg%~X7VoW- z98IIB-j6enB}iXj8m9CCI=V`|FpuuL!9u#j1^c+9H7S;<3;1_M);6}EnB4PGKE@?& z1WzUoiJ&&YJq?@S2D8V09zxzidAtIaYheV51-RZPU)Qkly=<$;G7L~BBrY@d zl$QUYK*E&F8<42D|9`3Z)89kX;?LZt6BKkm2RVT#VMKX30nzwWlxQyX0l5f-OII!~ z_8Im3zsP)vwyC<)GW#v+RN=d#Sv#BgHYVEH%abW0wH`cqLMHl%LK@#dnP&btyS}IS zQDK{c-E70=DPQ~m53fYD!tcU0E*d0LPW<`XpN`|z{*b%mYS(JBy`9NwvPH{r^o2Ck zoZ+40pPg47G2AoU>p4qg4?;A_n$|Bg;^<_y }Mrm#Bq5g)Zx|H21h8>Coq zKn+5Q2MPbYQH+r4)IwSY#hJ0Ut1hCt=6Sm>gx~J(VX2ihOAG&Q^ddW?M*#vWW5<@C 
zFv57_2U?5e0m`K)PMS@ySAc3dzwV4>+wit>4V;v`2cOGnW)%v)T^S6EJOhic8n(4z zuk*idjb}H&EUaNe$r3=5&{Hs3#RTyf#pLBKu2vHHP+n?X-`TJeFCY_X@`VGSOGl#w z;Lc2LfPNRYP^g5}pGK?$eYa**Jws#$L6Y^^w~?aeCyw}_zM3L6^JA}ro(Y*rlYdeB z%S3j}AW?Z3be8Cm+YR!u<*ta&n;@(O6ScqT#~UQx5366s zm!Evn@91&h7nOteM#Qtu;P3!akS+8*FEZ4A=D#3g1`3VLU)`^ z!nV2ikejs9m@Rs7dcT$Y0u87Zzr(ZQC{z$4sR~{!PxgIZMs-wN>+F;}{_Wils#~oa zY_^AddIYl&lp_gJ{cmHw-fcs{GFyHGhbovpkC3EoczjP~YUg)PIze4d6v0;=Ieo>r z@9mX1tRINxExrW?nr%4m@|L~IrGoWs{uUfx)we}eHp1|_d{^Hq)YK(rJ#d2}_)`*N zeG3&yYAS@d{bh5)tzm`b49!zR9z7G|;p3}q3y3dMEmH|&2u#bygvJ6`iBH_S(HYNTLjk9{rt7x}s115u|_h<0g_kmyKP zO#13uAWN@Afp8)|KT(#noJ5-a<|kjad;3U&qj${4s-D_o#X9!&ikf4ax#QRTZfy7e zfcA|ry^DNs#kME7FrBZ1{6g!}T736MEM2*L1M9Q&;nd12Fz;~B(j;e+PxUZ`qT-V{ z(zs3+vy&j4Kxa35AGDlCrwQW#TmLkWDToXfwpo#f-<*)CWLA zo_DwCj8j}+Pu&>CSC&4U(}6jVjJ&6A1S(<}H0DW2m9I}qzFdtm+(arllu1FIZlZi_ z6?w{9OGap~8&PJux_&<421TI;BRp5Y(mtP5oqv~XHeAQxi(=xUM|@^v(5oBU`uGw& z54|o~|GJxQc7Ifn&#T+_;PI-S>0i`l8v#osue5keFL6@ei`LS2k@t=`j9|5%!un&F zjoAKq%4+^*xa8Bfy8-DXW|zfDqu$v+yu z^4WH?%W%n}6S+iGR$zE@Z#P1Wa?Lv~I?|grv?i3+Q8ppTK5J3sKa$giqXpbW9j*2E3+iySZ*?` zD90mS?iY!XzS7C?)w2HDgnh3lxEwkT$L=eOWCwu3Mke=34fJVhM4f)D*1|Hg4-PBU zl0AYt(5PUE8LX!TQxi87t}9>`baBAy7PAOa7iH7I z$ej;KIwAO+?&o&jQfANZ$mH}78XR#XP=UL-{4}c@iF1B(UgYf#I@+6 zp5naZ+^E)6G-tVnXsiL_sE>y_ijR577(eUMiPf3zs z^u6TW0ncUOpJh1coc6B{dxKGkIJKUsA1S~C zPB|8XZ|xho7L#+Zv;S|X5t!y zg_@fuB&8aa>tEU&Nd)4=w|ffhr)ImqV%5SoDfUQR$P1sR7vCTV&Pm#-ec{Io%xM2vtwNC0cr zZ7t>($4#J1k1HJ{GaU02a>@*DAPYX^L(uXWR7%}99qa;Vb;IEKh@zJQJ)E%kOsX`8 z-Vz(%Q#EBgm_YZ{zYCFKptdSU-b|ODq=NbGDglr05NznFBC$`Xh-X!^1DpdJll=@4 zm!v|prhcxd&y|ZgL|EYulz9YG=^I_ySHrZ#io{L^l@-?D887pYHSim|Tbs{w9mX8)aKK10N&Wcyrl$M6)PoTQndNOIj7-t@PH0G&DPU+c6Ac?z!##-sNs{j+0QQLmwJGbFQ zSHvRy?Dn8m6N{N)Od)$gMhUhMJg- z_)AvfqAWUf(g&IjTMn&iI7yW=JrqsBtNiC5TzACQJ4D3H({REI&0MJ$*ZMSg8X!+L4w|Jt*+k?%dz`- zZ1#pjSG(3dOLqA@(~!2*Dj~%_{ajS>DEZb0je*kblys|$fUZ{64r7Y&lR|XYFEC;d z+mroW$k7p|1=P3OE6AGQ8u(VIOw{sfls=O8%80vZ$zwq{A+r9%6Wc(VLL;{k8y<$R 
zU^YlF`?)t^vgsX`9k`0J)#?gmf98ywgNvNUN00fhpzPOtj3p=UI;9bJ!S3q`b%&1M5`9p6vi( zU(Cv8Xt{`O9T;}PMiMs0j+Kp_v1PD8@}l<>JXI`}c+9+! z(9aT89E>H}&cJMiqMc;V5Gsr=YPlvpj6Ke@b7z1X3ZgiH!GtB5fdQY|ClRB^&g#O3 zuIP)rDGhOo%**r$O1{LOdjlmxzESx21Rqw~FH$iHI0`4(VXGAKzqgg_;Px3>nBo|_ z>8Mu$`giO7StFwp;Hri@E)>sMQpPWIN0l(V*Gbc-Cr{fj%vE+3U8(Pv+>Q-2+=Xhq zrh~~CZ_-q!E;Vwnf4y6@MxRjPW?{_8Nb~o@+c`K^Fy#-pqoK6!D1TUyRSznSByS#Vi;s*}9k{vz*A5QM@$#4G0!UQFjmRqSAtd zKkR*OTG8-Gf^MWZV-(v?w^sYSp3^_Rdnj4MW>FYOJxP#Io*FL0s`F*eD+#vv?TW0K z+D;%+1`yKJ8wI7!2zs6G*U+kM({dy@%%)SS-PRtRa1=@g0g-72_7VRK#J=3nTC1nh z12*-0D=8&;^>&A~v&f990%^I0hwdT#pK^6Ipk+s*oa{*zC~(=8amvA6pU4wYmAnkf z0la_q(S{sfPFEPq1sFyE9QDjw^&u>VFtq?1Tq|j1kK*Fm;w@f`JJ`kO2u=?oapYih z@84}7;_uYO&r56^U96}u%;0r$p9K7(TsR;Y^SU<&8`~P*w&1l1iRX=AbbJ;Sg~nrE zfrvY;ytVl^e*?x%EW<(DDsXu^Tmi$%Pj(_HNoRa2)_3GwWVSVJO%aQgdIDAbk_PVM z#Ec8LS>He10<<>|hc*s^2d2(N=+{41u#F$6A|*$?xi*v!iwOpLG4)g3}vV9fR|9=s8! zV2&yt`!#&0ybGbN#?;nO^2p<5-S!?=&KLD3#+CWEOZBlZuH!auLEYhl09gg95qw1f zUa~X2_dKUgV-tpW4h5qkbh~lv9>7N2zJn7Sb2iKTDvo-cqEfdMC~S6ZwF#>@bmcF2 z38@qFOULMpm1`fd|CVfQ{iTO=i&d#O&sVIzPL>B(Hmn{HXoH%cG#;kGE@nvR?N=Ee zXtQRM+}G+_HCQm~h6`yL&BQfXDE^4~l^{zOx4%Qd&ttX9;GxSd+ZIc&tQL91rOtw^1)Tpb$Xw9Fsk$1ryqLk=7a zLb|PbwNAYWD1_;{O?ST?s6L1pU5ON&u~d~~d$&|!q+Vks`x=v}=4foTs7Buf7q$1& z{Hh}m_%o~s8Y!Ce{~^MSe#aL!5FBC;75fK>ujdWuXYG&9`Td+4itvl$4rJyMLUKx& zA>v)Kae!_!QeLlM@$fC z0nTFSzVuPCJ=YsDe`%beHJAF4I17xyr*msBK{Ym49I=l9tW%f%X zjybs7hFC(g@W~SkyyWCRH}}5*qL03M{Ksoon;uFaRH2LwVc2x(CaMuPhe| zYlo1Z?dzo&TDC-d`K(PRVG*IHJP(=)yt^@L)rR1RL^v#n`ZiHRYBA71mY2*51KA&9 zH^X*I&W^NHpH~vJU>*sW7(`eM^~b&JucC8kh$-tY+`6`0xlk#qVI+YG*wYjrbc^`J z@1REWFlKss%x*(1-JjP=QtgqY9k)K0)4I(6n|(vw$7wQ_O|^Id>4z@aZ-Pl5dc?mn zE<$KPrIxg~%lA(uK}j+1NJc^1su5N()Jail_eL){Q}kUUC|kBDq)?F3@KSp+2qdv| z;yp+fqS*X+k=nI%V3Iwa8p0dn)cvLlH zgnn}lijRoZA{XF-f|(JDjWmZM9v?vc$BJbJx9; zG=+5{YzVr8la1YMEIIqm*zIW-SmQSRua?2&;%X)a0D$(T?bK}rl7LuDzLqE%=lhGK z(xnTogw4>gi14oni-$4sv$DXN1t%gNg0ZOXRfM+euQRIauqJr4UL zyKwfEP;64Yo=FD9{-|sSPKf`$3a;mv7@@MF)2Bz>+pI0LA(N?lHw)bOpK+OOO3IaT 
zgZ$t5+DzYjS5E(IEo|E`%*;E)ciR-ayV&VpCi zcV29=g<3pH-Lrfr@=`-HmZc6Nk(omDi0|jkR9j3Li}TZT9`dXZ=+qb3c(?p2%2qMW z2t|4pM-f=<;7yjTH^q>zxM$Z=29S%$-h6M7?1y*8qRiKAWbTwnpfDl0--3IkN(2;b zTVpNK&3^Pqf^?cK1}Y-Cwb4RLHRxT#`ELl6ueG|9@uDkT04{IvZeAaF^1~N`N?9PD3%dW;or_hfLSTg#vH6PWe z0CI&?LSp5F)z27Xa!UF6xNvd$i@^v9nu45aDvDw;lFSe8gX_~Zh4509tW7oJG)8Y# z6@7()=pm{-AwwIFClWa^ygr0ojH~ zfJ-j224i9}9}j1aN>_+epBnb>QZ$SPQsw17kayfXEJ!-57h((18bj#X&vEeI=%WS5bO0@tXG)#DIK>_ zQCdG3BNfw$h#YON^+z&1vN;PIIpx{Di$hzmZjRX{``4TAdVMVyzIy!5K}DUS>K-yi z0q^D#_V%0ywGT8zed~jHG9Pi7J4>`Jk>b){)!$z*y^cJ zb_nT}SoBIN2%>xOTkrWIrHvPvz@`^!Esd+}@bKZCO&}c#ci#d`0QMQ7_%(ZCPe|3A zx+gL{Ywc?C4kn~wz$}SX9gLKNm1FKt%}VOU{~=a$jEsjcQ>pg24b=0WaSknzPUlzF zFE`rJ&q!*nDX?;UCIA;2{%)DiUoCzYQG#*Db7DW9@QG_D#9+mB+sCA*$vH_WQdRiN z<4~J?%I;$?#vY;_ENt8q8p2(s7D!{9ViP&ETU)sF75FhtE!lZCw6ZG+z6c2}<7XUZHJIZRKj8JB$H#*M z7YP1-dY|+k8cKD*MFc5X3OZ*D_z%08!RaON_4&nC#H2uF;UMGsh+->3lH4qrA48X@ z-AJdfT;ru&zKUhg8VWO>AqDjuvFqGBlzibj_z?;FUMe|d!l-fAb-AocfRMdABV^a2 zENAub`g|%AvVhYyJme)Hve{RQ;zb_wY|i10T|B4C8|k8z!U?x}#Mc_F+=6ALsTk|I zm#fdFD6`N(Xlgb`fFy30U&yB)h*~=ZW^|d1|W0f|_ zW?R+UhC>6VYw~IoANpAp9nYBxo2|7|a6i4kbTq$v3vq>8p*@3`S2H&FATdzb%=x{Z z>6@bRc>ozUhbna3A};&cJ~%{x*n%@uxTr3~ffJWYn!+imjH(oW(Fcrk5*FNzQniuNnsy6_%5t4{6>?GKUfYM_}@o>8x7j;91_Bh(J5_vc~9 zbPRf*?@M}6-Jd8`v&MlJ(@g*{yse_N-1YViKHg2?cJ>8k)wi^Q^*539=NG5XFd#he zlS}=F=ZYglLOaHvVD4p{-y1^>8ZVNwG+#|j%}AgPOwxIwkD_$g$tLaKIz(?(VV0Jo zdyM=sL$1@~=1Fao*bDVSu|?0$wMah2z)n4uDK7U;)kZ|buy!NA^KRtnTl+f@&5Oup z(Ja%4fpV? 
zl}{J$I;sX$diKcwt~@{f6XA^zqZ)#KxEejeyQ&2$oh*F#@;rE$DkK?xhf7(TMQ0Jl}dEv z5NB7Pj${xXVd#nEo)CR&`qhe>ba#hERTp-J$)mBhmE_+r?s}M&uLRySO6)p%#$Pgs z82h3(3U|Tli4t8wQm{rr>;6Lt@i+C_@8|IhEnA6Y#%Xnym-iRBIFFfgjPut&6s zmQ#jmNUBL9e!qC=8fRv@KLLnVJuFxCHMNzx$S~O2LfU4Cb8%q(SL=mmbrOxDIk6%Y zJ>*FH+LMrh2#G(<#A`$CNn}tDe=;1^s`qRUMb%dn#fQbM3mAmKqE=PtZAB~7*6PiG znc=Fg1=lNH!v3Ov6-@VEe*x^Np~UR76(+i}2`R=#Ct!|!mS`rsnlLOvHc+yXqa=!& zh#kA)wWpR7+s2eTo9M8T&^vaVh77X12nizY9OXoh4+0}@F{HEGD!38l1kud&j`CvR zM-DknJ9bmpn^LokgfMiY$!e2C7v~c$HYZM(k_iP%WikxEFT1rKg@v*b_!E-M7Pm%M z#;2hdQU3jenap-c{XG8#%GmjW`pU8Lz1Tila~8$Ig_{Xnjk?7_YxWC?AIxR)4_7&a zsl(*e{gJF{R5LlZ*1S%Aq`|XGb^xRDtgzHENJiZt9-S)v3TYJM;gbWvt2ELZ4o=if z;yfEO%Rn&BER85x1u#L?vV(EY4l7vtKPBT7Dn49Ck30-679NZI#m{EOoQQR!s5cF z?DGej(uXc9HtqJ1I3%Cfvvt_8gLD{g=OkvcrB!FMxbX~>zRvUR9nkA<&=I1RCs<8* z4BIfu!EkUhV~qJ8d)Agi1(AkXI~*J3;GvA!)Sl14KA?WpvCdveuKGhI}j1IkV9jw?TmM>hUBLA(9#{pf6{;I2kv z7F>+%x4!p$-Gv+lJCXNeKSGKg_3iUp-$e|&6u_kt;alHt9Vb@KbHD7_BTDv8!Mh8GfgV8W?eqSKw6oVZA~^g)Ur&IWt#n&D-fJR{~M^`XLlsx+~`U26N2Up z{otBISeFK(hFM0^+Bj=2i|!pDN|^|o&7{5}75`$kV)Z?u`0eu{PlT3LD`!4injCOi z`Dd=|{QU=GDbg8|&%B+EB+2H{A;<3|!d&KrxfOTR49wan(Z)Q5d2O10FZ}V*w6%=~ zO9iwXzUi4%DRkPLauHp|WF+ZhWZ3(%tgMA<3jnl`-&V$ftUepbV<*Gbifh)30{6JV z3F|NW$pgg7Eoq8VI!`Q939o*)V|&Q2mT6>i!&B>vNBEGN^InE)9lnUP zPE~8X;ju>PcllOb7~%4?-=Mlp;)PC-)eOz^#PecxMYc%*#zolJ)0Ue-kYEZwKY2=g zx^9=|d%{&avYVhr63K5iFz0TMB@KWa)(=km3L-=6J0bn-)o0szF)%v1b6};7Z-Od~ zhRzaxZCn(N8>B@$G9B9i`1E0hysgZJw+OmS74tp2kT0Ygz8 z{?i#_vG9=(Ts?9ctN@FAqJPXk>1_Cb@&{HD1bfKcuUw zgE%ahs18e=OypCNw~nu zU1)lKKU9cT`ZRuAW1}DWW0WkbiVif*=_chWfX`^pRpgd3Rsf?aLz11c+l}~5@QQO4 zQ)z!bJ1hju>+m}ME05?CtV9#2&qOTMqIb}g%1XQ0W;#&$C42X#oJC6*GNtvR#k6yBYnIi;23Su zWw8|4YS9F<7yfCcr2no65D-Lifmw)~*g|O)>U>K-F>9vjdHA0F)~LW5n7$(Wx9@KW zAJpOb93}$a@rf!AjJFYmO0aHZAW9G!jR1wv92csypSQ$eQjY~EsDm;T?{%;+;YB9_ zU{;7T!CL(7(k5(}6wwe9{+N?DB5_BRnT?JobI6=vVsgsZj8c##yS1o&>%A#2puV&v z>88Gqp&@_(I@c}HZXV*}B)`0CHk(8X$sLi+_HukCOu*J?!S$Rz~Er} z=9yC+h|p;-Z#t2{+e&c2hwi%g|23}vx%~HEAne~@D8vd5m}M{%6>NY5XZS6O5uwY9 
zAlB#7%Zk93S;p$!cuPZ%;Oh(M6z9_s5+NC697&o}i|o_>J8g`FpV#x2YA*FM%X=K` zldk}`1H1d2>PRI)t=AV6gFxWbUg4pn-2Pf%Alpg*ioo}|7-4=G7)EP6PG zxoJ2y|A_3?)>7a|m=s6jMp6LuOd+eSfPN!;IaX&1VFau%E+9y&=L-RUCxT~gy2+i*6qMtB#fnw3S&3z1 zq|De)WKSqOT;)a8wdI2xd3|DA@~t$ zBM=kS$Wp^{3X-b30802WQKSyNP~COg5bVl@xkHN_xT>c-ApU z8Y+eK>P5=L=`dHcYE@*TWP2FgW#N?;sLUgBB}%x3jt^z|Th=PK?(ubyV~{Z^7>^vx z>6wC*I4`ah!EABkyC;PlbEX=H@o8o)OWqluKBsHhsI9!=V^py{ib`p?)0gy5t|fQI zW>&)fzYw7=k+~R7LD5bd-&Oo@`=UPy3j5z*T>v-|Kv#>(h^pvo!iV64}{j=7}sBkY-ZW9G2JW-GB9Q)$Rf&^48)?B zMV_-sEJmTyPuJnL!s={QL;e2#dP7Qd;Gpc&f@$=cLO}nnMZqvo(R0Mx=2T|Wh?yQl z&m-Im-Rc>8w(e2}DHYr~=y1?LN?w)LZw<9lOhiGiq!E!11bp8KD;?e9z4 z+3&x5aj?oMAaKwcedi?tftZKnqQ<9E7M|b}W+u>P=@1-I{|(*$ecJuWo&pN2i?tCt z&*c6LQRLNCDoGKcW!dm5R!@-fO2};x?T*dlw9nOi?tEZjm~9fF7XT~Vso3ojz(~ca zHnY|GTH``*E2m`f+DTp$kj*a_SiAN9rCY)$(I-(_*C**C_Q%4)Kz#938&D=$l?#MQm=|@5C(0`?@GUiP}Q8Qu-7RWib`hUp>{BM>;>)cf7P`JV%*S z@FaL@xc5Des7q*m)n9p8f12EUTz%RMJyy=`X-)&J?-|h^pI`c;i#XEZn{TiL_=UL_ zq%_#HD0!;a9FtyB3X}M?zc>5GE&tQ={gp;k+%nODyQaokF=)D>agkDoSyjN}E%eZ!sPPivXTUaA<=&vd2oL} zu!KAz#OD)#kSc(?ymrx@Q4V>V#dDoZOp^o8c_#GuX=mq7NTEneP|5%*bc*w&y}Fvl z?u`S#2&Ouhn~^4AFT^wa+>mt1TE}HVg@qA)tjf5zz_-PkCo}q(^9J|?z8Q~w2l~?x zK>zbjmQ0OHf0F?1GMJ3W!;XZx|KUVq6m2;CiM}@ACrFdYyR4-NHX&&vS>3Ip#{075 zGMq!5;#EU{GXi`JzD#+#eQPEpEW=DrS^`y@Y>tGl^9 z!oRi;1Ot#|dZj$;b+u7VWgLyg{HF?<7zyc2Bo>VjOx|XG*oy|S0t*9Fcx=NkNi>n! 
zHNT7wWqHOpc^R2`$)$Oni-^Gei>wP-u+hmrx>lxzDXyniakFYCt}LR67pY9SlO)%9 zHAI`_l-FxA%j~@{+3Y^9Oih2sVfEsgsHlS@sw%I9eqBL9d-FCF3iGLwCMojX!zj@x z%?uDWojIAuJ5g%?X1z|jY&e0N%M~*iv*Rhh$xVvqadYY2^@M#_Hf%>;)(-}*5c|+@`FJ5!$kvhkuX64!A%dW@6qXw^zI&+| zyw$};Y4-`Q0`06BU{Q9~j_~`k86kRjAanA~V}g#IR$OwkUHcU>Eaf<;KOW zC}GJLVPXJ@x&FLunIkYy9aO7xYH@wX-k0knmGAXR$M>Re>(sPCUL5K1nE9}p6YqXw zkzal$YjOAPD*5dQQx4q?ESpYJ`5FXR;^D$U2zEx?4@-6B`DG=+kR0~x5 z0%%E;^9GX7*aRx2#Sdl6N;;Kfi#u z9x6Z&c{WLJG2EVxKK%p-+`(;sY5sA`<;=`^1PNm0tK4lUsA25`LWbwqIhH&m@Yaqz zFi)qj;(4RlaIaOm1!|}cmP1qRQNR;m=le(@4=r_J9N71u&cKz%ZdZR!>}`?XnA5y( z^ahNfWdZ6kylb?yq~09ftdB@2Q|UO^%{{d4CJ+ndp1)x04aLY^f7!?He%8G$ z4(*Ur_Q+d^EIMG}V)=N2^q+ALKxAZMSm(W~d(#c>1v48Dw?5|NZ_LS+z|-4EzuL9( z<mWrO|pI!ZEctl}ihmdjD_44lq6 z4a5ekrn{n+Z;WB)s0c9#BbHYT;?VG)f&lEPq@F{tR|~ z%z%Qo`)_U&zZV5Cqkqgya8sp5w}QX5t0&Mi!?RYJ*l1exZrsb7{!3lX}y<%Qc*~h?l~|eQ?|YIn3AArP<9vUX_Q6 zCirLM%MRJC_bY$^P28d5?y3#_m3|?)&xYE?_wNPjlI(Gv!qj4& z&{(YFDYY$~n7?qg_AcpeJ=}vyz-)DEO*ZsGeIe|4yrXr2=HhZ`0UkEFD8=M{itOi4 z1K#B*!LW?t8hTUK`0VEwt(S;tpuxzU7?RhN`N7rkRI(b~5IyVdBG6{F?a(NN)TLJW z?pNM;3ANaW>xNKCA7OGjLH>*-DC#a-I4~&otbO^TN}1NQdIt{>x8i7UFWJ%k>Ct5_ z5f7-Ap2GM`f%u2r-cvX+_T#T9=Wj81@SbRYL=ytSb5(R|W4dEaaYfhUO{Tk|PKVHQ zsPt>*ZF3wtuV<;dtYlvj3YNm>m~>I(W-xm3XyNSPy!fy{p6?*XV6n|%W5si&Fq*L$ z5BX=d8qp!I<7Eoug7TaC+7IKF2fU5d=}mLqIpVYJ1n0MuOmc;7c@6{dEQ9E5-Ezb{ zNiyy!t8F?2ZTaMOEg-^@D?V}l(Pki4|!{#_79b?eOO zG;f#~hQvayid7v6Eo0cK>V;K4|B148?cTiI)XF|giL}%11-ZQblctVCC{K<>0reR( zwOR=<=3b7&@{XO#*f7wGvh?r>Qa{{i9lz{=?<>#i}I?DXb$}MJxYS@n|g6 z{kx&;TqaLg^4^{8&Vw9C_;08)N@1%dLYs0KVXaVc)|ou&L@T0A0lWPR?< zJ4C;Qi0)>E(zo|w=eb~m$0ekU64orTnwq7Ue+vKA)IgiHfrN0Ypk$bLgva}N)y8sc z>#NmH)p&f~%uC-$6^%}fxn;DG;TCVOUzmu9d<4n%@M$%SJm75iQ`1I?J#VqR^2@*# z%w3o5=-A;GqlBA`3Zu27h77~KsmZ+pkK4o>)kUusrHeTv2l^F#dxN1x3Cav`%@VU# zkDX&J3n8HuiH?P08J7uZOS`V}I?_(sWm6VG9Qvy?M%~OnhiNUoAiG;AKU1eU>0UO2+_^X|eKiB|cuu^WmT%74HD< zJ@9CG6tx}e=T~colqTrX@Q4sCbcPg}(G^kn3|{g>?v_F+wEHuzK*Q*ICXa+nCAq2z zAnhB!e4_dW1kV$6LW$%i^IWysvLkA^ODD#${ 
zR#8F@RM^mI)NpRaPOFZWueQ?&wYLC@3*o3;b1Nv?CT-Y{(%`o%gArKu6+f#biTJZzy@q(caTliFL^w7+3W*Gx5ht`I2f!Bxp{fmt4lSIqFN6$cabm%)QJ zk^-fLEC8Q5tBu(Sq zACwEn?xQHX7` zU!1;%l**UrDK8~Ty2&~4EteLn)}QKJsnfnXe%(~E+iO_dl1?(?jqX_lZt3v({%0{# z&}E-Jrhj!2iI7C+Z_q1AxwKFaX=CuEeX}oxO%hBhv;Z&9%3FryrsC(Vpn7+O38bl;;1-WC>dSZ*5-Wx zXnK~%h8yS-X?~w%NCoLHo$?5dx7!)0ZfuLbbOm_%aZ~5{Y%6n3CD9<)#XElu-fNE{ z>6gC_T{MUu&q?Iz+dwPT?lGi3Q4Ss&57sA6N|x75+#fbB9?}P|L#yK|1n@U%MhWxg z!>39I{$cz#c{^%y8q)h*pL*2P>?4Xk`?^haHBuZlO?wr#HqIrC+Q-?54bM%%>#t4a zu5}2Hx<47%(XKgZa3zm$_c?U)g(R-9%&|J}1^~wS>O(m~vn&mAjLph!aPSWoL67!( zk1eWfMNR@%7^Vd~$=Fw=j}j-vML7j8N;a4I$en zC&$3%jH89aK4l&s?LF(&DlN-CQ)kU@vHUgE6k8YCRZW}M(zy#FI253o`YQH*EtmZF zCgyC3jP>V>l$_|KrjMaF%pp(2Ei9<-!Q=GWrUf=$Ptum6yQ@weYigX$xHD-2JGUL? zPNx+rJWf(6+~~~crVwYc%Or1xSqyEmGyIhw0=qJvt8AfSlqp3qj-Ep{K1)trEF%`i zc?yiJsv$1YOQf|C1ZQj2Vq2)u49AfKHYKy${4&yI=M(J|j0TlsB@NPvd$<-P^wr~V zLHg5?Fs^V~d1jeDTdwLdjc+6Zo{5Lz5ERDQv9Joci33IjL2m`I)W@^!M)!KjNGiwU zOsshFZ*3FZPPpTci>@8pZDN7`2vc-kWjW|7Yty-FeCS#{FUUV6SBTewQ!bty47J%q z6Uum3t|D@bC8*+}PT_g4@yc_G;^D5B5p0UmIh=RvjXXS&ycQD)8u;hj_dDxAd21>q zpT5?8T_zn~h4st>UGB`iA7pKr0fIwcuBR&$R_!(XVDxLcVob{~-0-{A%Iuk-WLdC%&evzI|#Qrh14A^shN)}e%%(8~C=bA_sO_4Iv=l@ES? zqiXCgVfbgKJmC+=%A`GwF-f!XsR@SFQ5_jA5v}j%tgSSsxtW;x<+ymm{c0ENV7b~# z6R!q!4d4?hW2gMTeJ{s3ikx!g$13Mi&~LL0OXBvc>OKDXZf`NpB{z1bAOYwN_+>LX zon*pPnSR5D8tdt)<)Xl0Zlt{6q#qHYKa%s9pSzt#cc3cm|M)>n`#IOT2-Sm`{qK%>cQ|S>) ziJ$Sfr*-ktixs{`T7V3rSG1vDy4$3F*g-KV{({Cz{&|K{Me(5R$&LItFQ2>@wAo?7(%X2c8SH@KiO&lCK^?|0(&xB2>PqI!{jH5`+jtkk z4V2wq$ad*fi*I~Ly@Ph)xOwyo-6p1%P0DKfaJ9ChY_oi`SR@uqSGo5YzUen zv5H=-dKAVZroD;>;m1pEmmJ0s2nt%bI`aFzsB;!0VQd|Ufme4Oz~(sLD+%FmKMvx` zNG!v{SyS__IL!&Dv%n<|0VMS*dUZNV|IyH1qJ?uqr>^kT#zAJ{{PD~?mR!nZg?|m? 
zyrz+~SyEFZRc3lH|1(CBfxe2S+X78vpfL^I6qO&zk%hhP3F3e{Od0v?p}8^@Zdrle zYf}RBesPDY#DHq&gM9#99jdb8~%} zS^L+vP)xZ^g4t-^P1D1+o*5f>)gGA zhU&Kk3?e`(w$~25SxY_Ukk*+k4Du(QuET(rr zv(p_LUQkruzFzwGHzn$)GanyfF`|e6ya-@IE2Bg5#O1grB`u#)oGSt=@N8+41T8Y;{Y!g>9j&oB!2plsXg8&{(3p_fK z8K6EB@4mBOP`T_Y%6?Hnal%w#W>mVh*kcb&N>P6i@U)UpFe;UB(YbQ<&1^hAnj708 z69xxjq)N7_b)Z0L_VDu$beFVVqjb)ow8f-R4V3DuM8>3YOp&=7571T*(K+@Be{ z9Rk8C|FEAs51yheOSblr1#oD*Tq1U`(va!C79pl%!ODbdYoYu{u}=Ok%MHH#J||=hy_9 z>TN+GqH(cWGLfHGAdE|;q6@iNOTHv^BDNTPigTlwm0?IX85p_YfQB~YF2;!T-z+#2 zGh`HYD5UlAI&OeR&oDPGf^OTN_U|tFI{w#7=vb(sjecl&cqk(#2Q8pL>^CO9a-3WI zu+O^1h0*XdSbBGb;Q_D1_Ax}K7i?73xUu_RKf9&bRh zv72Tq;ph6yc(B_|QGDr4>(%S70$Y1+1%?)Tx^xc(qAn)*! z!#V}}i02fgvjC>Xd*D(M7oeqbBDiqWY$n;WXN6Wwjk8Tl4T1UloFyBIP0&bV_C+=l z`sA}$Iq%Y9obQ23sR>L|^iQ$`OASNj5l6qQ107*ljGpa=mIlF+rtjN6Oxr>&7-2gK z^j!M$jR)FQfRodzaefAyI6PD_c0c&u<|TByLyIB$8nr5GLOx`s8Tu$JYjFwB!GrOI zeJg4Qw)OM7m<3CP`>3r+H!}ICU={bax-tMS&1hgn37UKFP{T zUEYO-m)vm`#ab1EtxA+87kWWsGzLC}VM*eO$k$UcNl4)91&_h2vzv}BGP7s%#MFgQ zHN*qz-_SXE&K?4v)bdsSa9GRj5!>mP^R=3UfwvH*X^RA>r=lYx^zHn;S>G=O62?r1 z-#ol!J`=Sc)<{REO5`OaA>*}^@o-G6d3H6~7{o>#cbS1BHwNIGw`Kbke?oBa5c)`x ztIfh)pLXogCYn>r2DYQ!zPR4k-}_9KcGdS( z8}P>Ewpy6+k+ZmYE4Sy0ev^VyqtsA_8LMJpt#M3?%V%GkI>4ujW0w|mkEnD0AX_Gx zZaRJ_RKaytoSCpxcIz#*5<6h+Owpo4Ro~Qc`Y8?ItA6Pr*J}Gc@401AK7ul`Hpx8T z=xTOtFe=*G4ny`#C~ZVTnLe3Q{G%YBTqUp}MaKAayE0MKInMl&y+uvN1yK`6K&6VO z>zpHg3s-YJ#0dxPKBDqbssiAO9sLK9{8 zh5Of{fle-69ex*$1$_Bv@zt`@+xnEoMRPgs>*NXd$z1lrPfqGiS0z0@D$pJ>hS0Mm z@{S3MuQIVj)$C4iV(}u(C%pTtusnVkcZ$95k(!}JG!o9-) zGy@c-d?7X>469L(J^?S}gLQ9@0dr`-2uguokO9O&bvMppyF!Vhk~61$G>XRI29^xWN(nj=$@5I+#ve)k=0BSHB!zu# z7=`<-+)&9kCg^(y<-B|^ELVMP*y$OR9WOM?ZSxuXfr&EYA0)$BC*RYQ6V@Q^Zryh$ z-oK2Du+~>TDHhvi#Xz-0x^6+i=b*@Fbzq*^=$Y#FkhesTGt91bLkk<=i9-9bug1-> zob+V0DQFe}_@!teU%IxtKn}287|_#il-kl_q6kqxMgr6h-(bL_(s^kl;S=n#E7Xml zIl%mqeEnJI7yGI_gc!vgkDV|~;_7IBAnU>D>1L6w~HOGn03q!yfEehZ>l$ zOHGEQe{wl}U^L&s85bk(7*Mn`GFtAs$RBY7eMm`*Vqw0$bNGaG_90c`2FYggxU=xX 
zVAHZ`#f2squiuLaiA`5~R8cpZ+0Im#C1k~Ev>1Q1(@m|fw^(ANN*0gxfqxu6OdLJI z(}8AE%mpNQoqw7$h;MPoXaNNQj7h({9On~z|Hnprp8KJL6EYT`&OuFEQm8MXjc>y1 z7!)=5b5AM&8-X2*X}Kry;kKy2ahd7OenO%6#gtZWF-9pd_rx}Df8`@tJz<3F(=d}hD? zllIZoZPJtz+AATM)WHEh67Vaa_J^l&bkCXvtbE+s6GP_D@MTeA7f6QVM+Jwgfr$Uw zgC7Jz1gFY63D2&#FOht5Q#?-RkKKXW$PmwI8#UwS0V(Igz$x1p!tTTa^rH(yM*cOw z|JTp_c2`hE6`YvCs)FTrQugHEWC-{KKi~I1UlAu)%TMrB#ZNNhX-SJ#_=;f&{Xhb~ zJ`)c*s^iZf{{6G^9$X^zZ+n2P%|A2-m-}VHf)8fm{hK2KpWP?-dVP-Mvn?sWLBPK1 zQTTB%7UJc72UB;n@XhDV5iH`~K|uxyGXzB0kAGree}D1{0t^!W_rvf1JP`kTKQL76 zC<7b1583alPJCv6=bir3fbahM0KX6V;QzOye=fR$_*Yy0`;|JN? zzKVapz>oB=cKG)*7{=k>{KCHn{{MIcnWy``3=+scci10(p^ox$X^9_k%wfzCX}>u{ zV|oYPH*!je4IXeLrgvJA!(c)<03AAomoimch5vWOL82vKna<4?-5o;3+ zoKys%Zd5!P4RAaKeG|)lm*y_lf@{MkL9vzCd`2sI!Ak7!!!pyE_w)Gu`?YBDoNueH za|0t+q+1J6XllwnI+DtDsc@sm$|obLd`pR13=qHX_aDK$LbXh$0-brM=`seOIkmZT z^Jc#Yf#;H2Fk)VjwF%TBWKfZ=wfX%U9ej;}Xlv#)Q~{VM|2kwuNn~L27pa-g6%}0_9pq%K54xSgdynZKSJoEE#Sq}^Wem8BV+Q*VrfVW%aK?}F5u^woZPkw2!$Q*mmFA7;QmKY zt~l!&N<>Ls+m*k5+vmLwQAc5bgHhqWwPRGRfV5PEo+1TZCCJqD`kz#Gq9 zl8s?_ZQG2Z-IDBhay%CZ=bP&8zJYjn_rDV*`QP+(#dB7Bt7@${@H>VTc%>xx{}%oI z3^aC-{OkjODd8T%T618p#7v+c_XyBSA~+h&Onv}vGP`>ja>PKgfD@)X{F9CucRDEM&9O#{ z4IycjL!wCa)X~LGZldfe9jXajx2Gb#XxkZX!+9dhpcT*auvUc0>_XB9KwjbCS_e3X z%8WQ0RP}ZosI^&4$I${#+wzpW<0)XsVL3=&zEz2WaVTcOD8V=5Al1~A2$?|xR2ilf zdG5z$v3evBc{96f?(flY7J7}2WfhJM2fE5T5n^c3?3?b4KyZzjg9RzMn$A+#%{W*9(`Cd@bYKWv_tc#hgJqS>1ugfb@*%y zN1E~~<8rCN7wOhbqKP!|Ok28k<|U+NRcF%3G!gV?BX6p(hi)GiG&lo$NJ2@Kif5+3 zJ{TpLA9w=N52HZEVI{|hW`!z>F;4k6dg+=LRyc>{HU0ix*6mNcIL>dO+PeFtZsFEq z$mBX&TE*>|I5-Cu+(I-Q({J2(w`(h_aK~;jojyQ3u>8*u;UrOz=sF;KTBPi4e~L>^ zPjf$x@%!0id@PS>QhzQB&Sq)aU1{P|en(_-x`%0#h%`F8A|`#Qx}r%Ud4rg%`Xl@E zq=j@?ET=dsI8vgo0yR>@?6QeKEOD){-R~;AH?bhnE+Ok!9pv_WwLQ=O>F{n=Tpu)V zEZ>?WcdXG*r?<<2+hG-c3RbZ8>cyK4H*j{oq=1-M4|E~bIhD?12&r7Jn2mWWE;5B zr5tK%0Z%%O4Ws6VvXC4XEt?_Odegw!5eg1X6A;8sDh_Vf-_6Vw{89zw8x+;CS?4mtU-Tw4sq02X2c}Q}pKaKDu6)71kzvlh>|ck#@r+ zlYA0}y@Ns|-YV86Yl@qvN=4;-cihfG)hS(H-$>3EvaN-N^eZB+aV&$CMqc0GZ8@~^ 
z3wR7VJ-dSPgioRR#Lp{9FpX|dLgYFqXKDT1xgdW6*3 zg{)1#^E6Gd;5<#oupnM8Z;6?)*Tr7>N)cGyXe-B4r&@`JC>?7j0W~fwLY@-+tES}& z=F6liTIo1y6&igP&GaTNnxZA*D2L~<)u|Uh61R`~p$RZ^?PktL;&fqPMyAZVQk;yq z*~ue9tY7@sLfBV{)p>FMjL&T7}DU@%n3 zYCRW})Nxu<07~KyCx0)&fou46SABTis9@W+G(&635Xqm9K8Q!}i^))V)px{vZLZ(` z#x4TcCE9;d9xM9ZLH~!gw~VS|i@F6#2qb6{GVp_dKLS&5YhsI<_Mtv7 zd#Q~4OvFg;kZ5Q?J3vMn8NNf?=UAh4{VpYCIN?zXB*( z`C@}gxN~JDsTyRYQd?NSIl>rTVOc#rzZ7)WKgi9DTme_e%kRi9z{(R~6c<6jjME{%>{r>P@=_!Ycu zq{Q(Ml^ewmyX?LvDvRAaG1tMZG3Q-yG`PJ&<+ARU;@u8acO`^|NXy#$T=0!S;ty} zwkKJQ7aF+UU`Zrm}6qx}@bEv@rXy-nwh|=`*fbwCxUfUPb&+ zOY4&WDht)hLdYF1@D93!S1~tD+fu^-`J6Y0v#TJN5$-6BqjiX5f3M2{&iMkk7ylWQ zPgYD+G>5MOEY@>6Uwt~YIdfy8KOsm8JeIn*FHXne!{edqWT>`#8_1#N2x8`$giHS# zlur?oi6qctM-5JN;({2a?^8E_90C^)(JU_DA}k%740RCO1&zw8TqaQjQ8U2*nhrJNXq{{>~Ie(_E;?4bgQ6Q*P$+PY%iLa#1x6oh*uwa3ntGT$yv) zG&sYV?YWeG7uWc>7Baa9NGDtA!}Bl97w;4KIf@i7Lka`lZ{6=unW!|IX#Q*!$5Zj8 zwz5u5cC0v6lvy3yDZx0MK~Y)kXme6|hp}-a7By9=ji^tmx;EQ8Vw6aO!H|{z6vGb) zqy7`dJ@iSZvW&azuCM4Kn|+Z7Pw9+48io`)Z8ddP$w%(DS9Y@KEi|_?)sI}$Au6k3 z)V=$zdm=~&9%ISe6DVqFC@a=H6s!dPH{;s(%@ z&l(k0)`np)`1W)fBFU3tN4_qISAj8z3dddC(0@R^4Y(&Y|6$4$zu4=L+d3K=Kvo0s z$}EarB!aY%w&Ol?R#Dfk-1AVPQoaebr_z&OjX)P4%%kP)KQXY!W9Q6;_x|V76Lv?% z>%FXAkV{D8&60Kpl1v0FRB?%p0ZoWR{shtO3}kmF@!@;Fk#hP)nGPhomm68=BC;Mo z2?qpBbZI*U5JV#QlTonfyIfi5KBA2eB;%?W`=A2Pf3O9pTp9rgi*~s}8JAfR*){ zFOT^p6${K+LnCP)7!R@h4F9V z$hq1y`$Gs}QRqhyhH`T);g~0!KJi+Xu1`P$A-b7muzD7)2&Aw^^n1jz{1XTXi1+nq zGUM_77xdqu?BEeq`kBhjvwd5!@PF(}jf{%KqY#eXh@Ux@=Z$nx(BM}tdYRNJo-d~y zZtkmvoa>#IeAp#0Vkh|xJ?>(24?EZiS5P&QT8W@vbgCCPPA<-v;DT{0{hNfRa}^+$ zrXZ#Yyj}keutiwI9I-><{&KomI$2pA_xm$Rr-YJjP9XBsRu7b9L|4qgqG0-McFa_!sK!Tp&@Ad9r?NPKinNyEm?n|bNhdZVbJyr1Cw#4E@Ee|CgDW7{;Ub9*6(w$-IXlJZ&oa0pe0eh01I_Ky+ESMmUnJT=aGVC za`YPntd!yZ*h>KG<~feltbR?GNH2nW3JV#u`@REXIM*0@*XwlqNKI1NymV%|EFXYZ zg`S?7m@{ed82nM^EduP<{NFXMS8J>%e%&QNvB%IjurOF^JigZ^20T%g2d51PA^4-| zi~=%g>v>bO&7>E|`rbj?M<@qAsqRaw0t-0mUsMG6+x_?z%*zBz%YOShD9mOb)kpyd z0R$+HjiAWq37IQ}i?4KJu1xTSGxR-osjewk5=RjlAt}!|NE6+FwIe*iFC=Ldv!&JN 
z6VMrirMA;tHHbh989yZbwi397dWN{T!z3>5aQXw+!`miTF|wsm<#@iT0SXh&Tfw~# z02oG8Ax?Ey#!fr#WXDFB=XE`46cNNrr@~sqf_din<4GLWaujfHXh81Hed%HMZzZIM zo39CSwAEqF`qz#^5672mbh4r}_U{JI-v*5IQr&IN2=&VFXm(T|hczC{-4smBAU}E~ zz`Mwb(^S7>z5&f7I26j-QG&CO0V~eyKlogF+OOl+Fd!#*miPn6T;j7+#@vfP3(_|6 zbLf426Ss`b$(vYs$9^#pHNtU+gnWPc^J>{$C=nX6fk%p6+mxT!`~{fG-v#~mZqLum zZPh=qyT_7#&ytkMhJBwn#)W*M@z}uH%q-(d81~NALvEjRaOUrE4E|U8JKA{dZ=V0K zLQFeu_roMMAp{!`SZe^}a2;-=gu$Ho(dE@s+N@?TEEqRnFMRzFCR8^$?h5%9TdxgE z90IQ2|1C~tqrivlREJe4zFZ7(0M`Ye+UH9GbBd#)iZz=}DU^S_Y-pPTgm1EQtbj^n zKZq*$Bi-Vv^NrLMa%*_PRD|o5tKT-4{JWn_7A;$*HSfUEMeqas-6y(0Bi^%e*vz*<-f(_M^LZRO!GBVEmpp z4^OziBc&NAF$fHRP2il|oI05Ek(b$++|%P(mU;R)y)IQrNT^R0x^5N>TV0MWgz{@& zFj$8_u^xA6_Xg*~O5#VQJR`29=CtDgVW%5txtByZhA*a&a8L43wuZsXx2^ z1#m}+?{&L0w^~>#AKOcz3NlyE;zW#&?C$PxqPRFT+%{A(icTW2}Z?y>D;{ObB`PfTYmw(u1mOa3IWTKRpA)4j zaMlO#Up?K@d3C^~cyIOs8IwyPI>9xOVqc-9u&%~rIRYSJ*$vo0H;r7Rf5w8r-&x1G z31Y&W_Jfe-S#lrA({mX{xkT6!Bey$Z%ang)T(-VE)jty<{oNbef0QF&=N!YOvqGgX zj@@nP*Y`@!a-8@%9H8w5B@v_9=gAP3IK%sUm(hQq>8RXUDLRo|3R*>Ag0$x1YqBf?MCq^|nB z%Wu{?^&(6PJ_v@patb)fKhmzJNFe>VeF1Jk?e8HL1kRx9mHd^2v2 zegOP;b7ROP^Yg7lC0W)r08cbv6xDcFA}U9ahWNRL7qiW}=`a z?;DvPK4YuSkd`dpQ!+k|13HBUR*Ac>L(Xc4<9g3yLnf9v20O8Lj!rpd{Y#$ZI7(fj zoVx@?EVBfy!N43JgLHywKeFsgkj~2bfsWwAhQvAQXZxREHVF01@(jC^pWAm4vkiIv zlhADYyKD~K-YuiZw(n%ZJh9q%QS7H%lq}95UG-62yxnYJIm01;6n!3;RrDklqwebn z0Ms23f^6%h$$tGt@&V<-qlk^>64c3jyMKez46I?Cw8S8_a1i(QpD z6gV+O`J>HQkjg~jNkA*}sMy5sQcKtZdJC$SF^AxFV(^(-yJcQcoh|Axy}aqQ0TG&v zCUv4=B)>a)j>E`#xb^oEYX2v4K^I$gYpdofj$PPJvV0ML5wd?s4h#ifSA&qX?7D?k z&!O=n*rat{x3Ddi$EEXURm$A3lv@0=+w(@LlS`3qrb`ddWM!d9FgL) z1o=~JQj(#gqfW==e5zib7k>*IG{QW7UgtbN<7cS2;{fu98)S`}n6ySR)M?67r9>bK z${A$-oIGbNqO#v8_bO)IbkAgm>-4VQf6zq^&bD^8 zywQf;gZ)Y{W}n!gg>F<~(4cs)@w_}rO;VL#AlcE>5I&5c@yFjC8NtF+HO5%s2niV zxt|CTA?jK&sI-p)lGv_6-+=0!kkB_)wRF0H%oR~Z>dUi40=c5E<>K7eu?KQBdo?VTLZ(bYHM2V(JM6vYa-7I-+a}{g^E6wD@_U z&piH=_?(|62A73*4meeQ)y#u@<5Ke@501rfiq?cTY76Rnlno<?W3I7pF~!@x22kpL5zcYo=6*f&5V 
znIby+9nJR2l^Vl<58p7#BeGdy8?ql9;Y^KpX+U5^lwKoCG96_3q}o z)n{YXFu%e74qiO_+~mS0JF=-52}ohx%*Pz#QgHNOVpgbE6ZHRIs0u~q|6iyIv)@$3 zJ`17%sIRNJW&cLDg^;2ZZYW+AkFA~|c208wJLpq1=C=#$yeaFN=|CUdC&>$)rD7Ge zDQ=6k>u$3>iD*LA?@a5}3$%fRBhcSa3eZ*H=&GeRR&3TnECh@Z!P2jI|0ENTfj~&W zsfI`pf7$dgjYw!_gU zrZjTOTArc&q;K&-MBYsDE>jE=3;mk|AQIb4nTu7dSI0MaY#2%HH+;#E`tj4oSKzIe zCKS4HDL^ran_aG+2DrP1^c##6(#CE`*5B0{EJ;l9gL2m`GQ6ma;*Hm~K@Ye!iXr^b zLQfkY*!I7R(%(Vki)hKs+X=9}NIrK*me+=c(NW)&Yd>Je|u+4i5@sOG!?} zK$vJ+KX=s1b?;z5w-IQ9_-Qa}3R3yLm52!p;HXKv97TXvp!-sl1MkJDqx1Hmn=NA^ z2ERqp_`J%v%;-J_ONUJD-sOne&}70~(*BNd6my_3KtG@{HrOJTga*-}oT8k$0+72* z7RRJ%KT0Msm&_)e5J$>YMlBV=$R~@Fe#Gr!#iLup*EB%{w~IB=Swl`r!A_4LEoS~y z@DX?lJ^e-bVVR5g;+hUFS&_4`AK7bviq(Y6URk?7=a@kHLmEXS2!eC6)Eh|I2O5m8 zG?G`w(`M?-$7nn}IOnTZMxazQ1IDU1N4|D`X!bCe(lqNQN({ch;hPyKDKSWb4DA>r zFH3+szLBKo>Gs`!+jLO3AAb=?BieNY5j*H}wpf00w_o=%D&+^kp|f^=bNmR_=JD}K zGRBS;YmE1`O)fhfBT1={Ng=l0W-W&|R);8Mz*|Sx=xW_IN|}HrK72pY{}&d8)G+A9 zbl{EigY#s*M2Mc!Ospbii^vgXTp)g3l+f*h+)*cfaM2Wl+?ew%!0T0MxcT{Qwu~-O z#2YXzus@0|Fi6TY$I8dYT3>l-k_fVx*5-rD+uc#ZeLNsj)fjSUa^~;udTLC4>Ko=p zt%*?$VeTI4B5r9i8!AN4eu8~th{&~hmdV9pe=?tD?4GYA;mfy-x)q76)%85s)~5xZU`(QnbLj{ZF5cxL#{gX-q|J>3Jm<+pZJqHy}tEiZmtY0Dx({w zdF$2d>u_L2ec@+v{W8G#sFoUA5|mt}V~Jzd4>|h1E_dpXr`(^s6qz``;RTZU`|2E- z=*;bzP+HcXH$>P2Z!Bw9Fs{ZRMK%&t=?Z7dYE+PL9!rvH^OS_e6Mcqd5}>D0i9X?` z$X@cYry>S?F)Ug97=fZ3{<|ulfLqr4W_{?*OD4ab|x=sbgz3l%F_txfN$T>S12(>;hE4ghRl+_y) z#_^qB5$4stcLw1V`rOowhJQj+n}Ip=MZ@yVBNZHhakDB@E0K)UT?tXzH=(|s zji-f2k@%TS3X!<5uziL4hDI#T3C9msVQ# z)oo5rxij*2LM$A4#DDJUaEO#f*S=F&p`2hEyA5fRZ*B+o2W2XU9x)ay=) zB2<>pYK^iGv=`rWQ4`QOR5b^X)8m!JbsO!AB+kp>vad3zT|#qq;D-qK{Yh{pg(F|S zqEe<*!HB}8j1&$eSgqPlCp{dH8Vz!6S+Qogi(5O^OzqA^-f&)cCFVOp69R8tgHtZi zk7Ok{bP1ysb{7!NwAgW_w^5`aO)VBrwRP;!tm~5@M$A zJ83|6VD;eOQJbV)HjPC0rKB66*xOe6vzh^!Ay{C-^is#U7IDo^z`dwUH@3D-FtwAA z)g$udVQYfsJ)w*bcwE8@*W$QNC=rcpLGh#S*RTbLZbBX% zSb97l8Y{!IG^|tUY;LZ*Lec7lc7yFO?#tbilVFQP(AGtlrKfxUCfFMo8xR@ogI>V9;}%d&9-Q zPHG678s`Q%kbuqP(-!Jr-MH#H=JM(d)ag<== 
z$K;?ow9$7m4R6FsdZek2$3lAOBNVYKj`MxtKVbUN;*C9Zr8M|-_$>X|$=pS6+ZWTd zRQnAhvlN=JVkmXXXXeJpZ>@PkLEQDa7hV&?{(7w18VwiTNea3}sW6+FuMXgH^cDyA z^Vi&%2ayjjUZhhw|EC-g7+&Z4S4eQJPmiww^u1?;)99W(B|!SQG!UoUcND@{K&;0v zIs0W&`1ia_b(kq$MT0{ zjho;$Joe9)kN;?+fy4Y2HP7+K?&#yh3K^)3nCGp_hePBMx$}K-kc~)=-qqoxveVT0 zsmIa$k{5V4(=#6dxI_NB3g8DfmEq>COi5KSH4WpSMQ#+t{OgZH0ExGb{Vplqb!V;~ zc0C_JL@)u5$UnPm6OSeqCS@=H3S{6zfKvGry8}Q{n#1*W7*|BjuAn!hWvWZ$Ve$5l zEUw44)YlGPXA@dB%Mf5@f3v4QNBgfH`8ELfu50Sy#Vf;6L@8p=@sRKf8F?5?-+NzW){;IqK70SyRP5`hKpcdtM;xVw_E#zu{FM`^hA4e2+m zR6>q$nrL0`(?Kre&076B&^8nH{al-P|LeEtlBk*_6pj6{udj!aj#!dH?kNBFLzZZ| zn_W97^>>LOOL5+;W6-OEqY~s4@v##o#VwMAortp^ zCiOpj9fRMJSD2P=_nY)Ig?j!R3{0%uGn7vGbY*Ae^2zj|-spwFg`ZKDudkgN|AZVV+N%9`yXc1{YgFMa_5DOk$i>)*6WY z^?k4}iL^%S7n+b{%=Q_<6whDwpnS&mVL#e(J;_Ro=M`vFSL=xV{&J%3)4*4hUo4IX z<#TET`_Z=x^=L`1iGBEf<75*k%6#6^(RN#5C)_bhTvg8$9Fa+%=avWMGwB5TQ8W1| z@b*f9jXvksjPvNxdRmiqy1}sf#nv>u1cn&3$Nlv<$L?~u0aZoiuLFYJ#-9nvJWVY$ z3=Kc;_FEtT8My53{`u~^lzbaty7lYYfn~^_sl(^?D)(9E{(RQwClxFb*J88O1KWaL zubdQ|!Jo#QJKh$wfA>RwIv(c`Sfm23om&jVY+AhYhHt!pMwXBikFsZ8ZUes?(qJ+i z==#xT8TJQ~b*?dvM>r-L1H<39RE1Gz=H2dcQte?`CHUw60OpPK(MNA>(|!xUe~>q% z$0G4DIp^oc$||IbFA<=2t&r>`cw1$O3SKQ`u4sD)KgyA}_+~+Z9e?(gZRW0SjCDsQn)#yKA zNwcwqQiSlME6B@d6l)h*g~E~yJ&|G4(>y)D8N>>|yud$C0_Gw4Z%oboBp2Ijhq9HR zCLua!v`_^BSkq|A1rbU{v5tl~-C&D3A>oMCMw4e}YszYbYIg2B8dqq((_D%e9MhtV zaecC&yyC&{+l(Q^hElSH8qWW+TLK^Z)BZF6RfqPqL2^>-p$jPPVw0bg0 z$ImWrv``4}7OSQ!LXT78mTWegDsb%VSFc0}6(GpQz0KxQNcS=O@5D>J8E6H->E42k z;`}Fk9fd*rOeLm|Kyb0q1z5AeSLp*3+BWDv{r7wq{!?ed-6}(_F$~YMjwvRpvykvU z;9mrqqM)Ok*De|3Hv&y}8shx9Nq$l3qhox=Lo5OoA3CY=DV^_1s$4v-HSEFGHPlsd z{?xXiv>UC5tH~8bHh~>Hhgj}23T>&W6jkoAgtwQDnC)SQ*D2Usu7Or)e#IQv^rfd< zJYyr%TfwGpF*3SZqc9~c7hO_3qBVg7&o4Jtt{1t?Ara)m1d6P(M810V> znz)c{ilikzm4NPilGLt3CZM&y%6`BOAxqy8tzdFaWpW2|Je+C!l)Kh1nfXgEZ`P1j zpgY}^sJ_%aMRj3`^xDY$Vi$PZBYuW+kRUn1;85G#efQ78>pFVN3Sbh&YLVrtp^Bt9f}aNl$~KgcV|w`hC$ z?JM&s*=~fX@I)LTrQ58NB)82j6qF-F=)GtJ0iDK&^?hC|3NvN$-Qx|YW-~!3_Jrx^ 
zLRs2VvcA6cZTmbJ7aan7kpv=rj)z05dR}D!h%gWV$(kC-FgWx6@eZ>=*|atA@}Hv{ zQ#bc2(BKU*vk287M3CLm25%(Rzcm(a_L9ikaR75P2VKW5T^VBtDEG?_SdKS$TxByX zjsVVJatoPB`z8LEcJ{j`u20xUaA3C_|Cu&${~uv!r^TV5RI0%|i!{KPs*?yiS3@)U z1M<4@@Jeq`CX8?t(jOeU4B|iX__y-Q1R%V|atUL8?eVO$bHgv#)$t`0ApR=mRFTQ} zSqj)72^5+h5`0Q8mfQ&w(HdV<<_vp@JxPIPIzskJKUOFTgV&(Izz~fUlXl&>7=R?w z$rnyuDqI3`H? zCVNnnKNIdNsb>!qlAyBrf1{^WKy>XAcNGcvssr=tJz6%T%y>cAqELAkAbCRggRTJ6 z_rd>!pV-qQkthro!XRo}nN~SITvj5o_!FtWZY`OTY%+Q?o_HMvp9}7ESf6+mw;K_K zuZVKp5ptdd%$Ze&hC2ZKrtEB$-~=UZ8MFI1BKhmmrn>HlP(^ytOoi;>$A1wQydpMBqeCyLM*E6$H1FqKJ~GX55!)ua@?y;>WydOgEkIy*=MI4jlD zJ+%536>kgmRz{vB*&yWG5sC4q2yQ-Fc4>;hA0U;yfe8?eNhU{%WM2<>6_L7hIVY+d z-GUp`Cv!ve$i@UXcac}89zOnfE!q`fio$qw4nob3sl_mB8@wd@t)&mm{H3LryqpXy z+rC!o%ju8S-zS;~WbFJl8NdO^=yO|U!tf{i9TwDH2K4-7unU+dfF*50(QYq|8lJzaOq_F$wc52n;3 zk`YVbS~AtXw7u1SvR6G_CYkPGco{iB$+TG$o2^kS(=0Y*D`HnTYQp{`!pJwCTaAtr z?@DUfW$Jcz);6jw^Km{jFaJLA(K3903j5C9E6rf{#|*CCgNmI=e-LF9JS>~w%|@#| z4aU(ePxSPc+;7xns_&6I_{W;gQcg8eO7YCZ$i^WXn0DA5N~Qs6#wI@f2vH&TAjVr#-q)R$LL zh9Dw*XSMoH&r^fXZ<{knyk}6)o4lFce!|tbaHJA0+f?n~f{|&xg_bTnWH`Ed)CLYL zefi&X2W)P5f$P$PYaFsjF%sz~t6TBMFG<2GMdmZ$Ao}jA18^JV8zD$)Q0eDWvLCTv zM>|=?Sm1U6qU&!(ES-DB_}N~*iuJpX6P#^5+&mD;ZNm~+e_?hR?D-ypBnC^F2DxMf z#`J4tjWWED2VO zT=2PHgmD8P+z&;*F<9njW;Qa%H3dz1Q$5d-I&>vLFb zQg}_KufHX$=Yq$IBI~ zCs7CD+crV7@M=W6F~8jIn6`=JcJ$+9GriP^EBaXI8Wa8$r~?vxFCtjj;@O``Ne#~Q zF6ZP>4WoZl_J)zh4J56sQ5}LW#y`yf`Gw|lqObL_fc#qmr}28e4CAl}a<(vtb<&Ww z`pf$HH!sO;-zIBKP|*db8e*H0LY?N8!I9@x;|gcNWOlyXPg!s>@EWTZ++p?8qs`4n z386avg{uonjAcBCSMER5m$*J81>?Ry==3Z`AX1)A&o7CzU-AsscbcX8oI*lY7)vf+ zI+tGOta*m@s;RqM0ZXF@a+aalD5krf9S#ia>0E@LNB;1lxj5O#gx-GBB&)lzbS6Q= zB?W+(WnJjCW8%Q2G-0ThK180fIBx2k#fsNfbUd|$bgK9MAH@7ee%a_?+)U476KHyA z6wOm*xTe|TQ&&SnCEz-O?wg6|b~s&MIyE_0TSF_N<}>ngIj2M32xgH1>|Fi152}T1 zuV&%6RHqAC!HzOv@Jj?1B@~<3bUaI?MsFv^SY;;^e|3vJ-5(UXan(t##-vcSD!*{H zM?Zvr(5}~j&Zt-r#|9JLf*x&Q`?HM5713}5vi-I2n6W?Z;wrm2U|aG{do5S0kA&D4 zi?}s~m|Qp%-U{(^=<0asNK+S5X=k@4cYF(D#5xQq=rg(8*qn+#VyR5nlcv>XNGPoMV)t<|#@8dwf|g)=w_`cULJV(V4&w85 
zRJq@=A;d!pCrCPAoTqMVy$kBp4u7^bqHoAPiXd_CjrhrE7qSR?4zlDv8LPVWuiaO>o3Stt;EuoX9MCa1Vd;(%oIM!F#spu zYhqwc&w9qm4Mf*};p73%t64zJt&pkiQQsAgqND_4lhCot+Wm~pn<8WreI{q*yWmr- z;fWXh@8o*urK$`sGIwG<@5Gqd<*+NO0F2o!9BYpum3Rc&-4!V+O*gk-c!|^e9EnK$ zJ#wE7LD}E}xYvwCxFN)hRNk;^gRBp8X!T`qU!3(d%!7zdt(c`5FK3|6RV6!B_x#wH z>(1^B$3!CaXS2-9t1{4jMa9TpJ*QOQL`P+m3o( zdbZuEUQ8LM;pY;OF)k_?y^Zrq{!lLjog24$bhkm%vZ+K5Uh;$zU4a>0|1*bfkjrkf+Gdd-RBb`d|^)@m9_ea=aB%_}Ie1N!%UvxeA z5{v1p2>A153`bHnUT}`ZCXU6I-&X~cw&7NU^b3mp1^1V(LQIh zfU6WB?Ma@WTC6`6FRjA#MRwU9*|0a=as0kI0(bb1B{O301T zsG!FxLNGUw3zz$>y|6F#fr>t-uPCDLyLmE{BN%t zAuyFa6!%NUEj`E;MFfNdMSHW4_q&B*rC0Gy{6|L!=iykIhvoDl52~+PD6Cn`nij0G zNe7nS8>}@-)b6EB0`^3F{tEt7IsCUUM6@Z{f67S8C5k@)?k=2nh*M+Oh5WKJ~Ez6*zROdoAA|$BVI(C_7Ok3zYOjrsJyxN^>?wQN84Ge zvoq<6sV{$V)|+Zr^UL0|fg>uTTwqNyX=uE>^z*a7ORqva;P1lZ14nxAn-KXOswltq zBU==2b5PWBpwGE=EmilOD!ubiPbu<6g*76j;^tUSG<)};d2TCpec8R5S6Ua08f&GF z@t}U}m!If>{|e3sPWK>FmQ&jY{ASfTr|Q*e>yFBJ)BKO04!<6jn;NhtqhlarU?Fb` z*_n#;uX2Y_zEg&@E3$%-%mGUEBA)Kz*`%HKKkMuFL%-GyeNpL&>2_wbss+AD65kMJ zU@wN>JYVueid3Q?-CsXH@g7!rTTkdz0ezB~O*H+9i`J!d!fQXp)4DI9y*3H;L*tk} z5*;9iX5u*6N^_3{6!e;R4HZsiHVZy+-TZzJF-W%2YDn~sj_`=5YIs+NZ&KUk+|^6( zYCh`T-F~?B>u5-xC%1ht01+?rX&5)M$Au=AD(oD}V5@GonOW?(l}3mw-`O4(*;cny z*I+3&IA`}L5{+3}O(_^y1gNEoqEpSfsl_9Xewri*!PqEI37A@6fU4EcR5UJTFo%kJ z+XT9O%dlr|d+2%5!92MrvxP! 
zpG3*Xk=s?2tfHhuS+94Mh^#6ZRYV%r6E%=9qidmDVJJ4;-%k*xj4@kMdwph)Ia4+& z>KayDqEcmcaf)<$^0VUMK0|;)BO@c~qHA|l$~55Ve7BaVR={7XqUxhS^Ad-^_oLP) z9)t9~-*%BtLeeLdvIYC#;AfYPV`k4aJa232*$7W~hAJ(QlylQuKX7oPTRhYQ_rXzl z+vy(@MnH!Ci&oHJfJiV64b-i+T7HZHN^Jsvh*7rpZHd9Zf-(3|eO}4$q0!wtcDClz zY_3;AuQVB>Z%`hR*T*4huol{E0i$yrcRI z>RoucNn(yJc@&jT9)F@WO~HGQUEg8E*%oowGQ_MpgEY1}{7x4k z2z!B~ms{$C_~R7?i)N-)77>yHT3Rkm#17f(XS{m*m;KC_JU^!fs(rfx;ly9TC;Sfx zJ(4qjtDTd@_niCbds(K*uK24JIiPwjDk@~cV$DLQ)!sfyPM*CeNredxf&n(oDEs8Y za`|`yvKX#M$(SidNxSoaZ!CBbwFVq)Wg(|y`vfbrI9q+^hy_@H6~cp$=}rpcl44P* zFq{JDl5?nJs7TH=UMUq@W^`XmfuL+HS!g{{z_y}e;>w#HGoB&i?MbfqMK6xiWFK^> zseIaphP*CynrpPMyj|>+to!Ol3%({bF{1;$Sb1QrZmF8oQ?Eu?DWbbJnLw>S=eu=ORc%B6;(D6=$_{ zZZluZX=MnqPe~yusF8C4l)Lw>TTte%X%eGrs=ph2u~4O;pt7{BBKeKMTXEH<(oBli zIld+`XS%rL!pf}^-u2wotFq#h`r}N6oF!EzudDl38Vi~wVi^>a-VL?G*}SUy&2|=2t7ezPz_=6r5wLlPY_oqmu+X^0p==Y(pmwE-hybd~qW< zZ;+26qdKO-Q-+9ITThOyCO=R~2hGAV2_XcECc$6*sA%X{3wf)*d#=kFty*Gp9_OFk z?IfBSqiPxV$s!lAMThiAejQsRlX8=D{j2mieHetiK572cUCR*X)kb9Y*ULmfrK$X!*&1Q)d*cgch_vqEcaTur|#Rvme~ z1haum)#CX$URUelpVan4td3AwUYa0cRIY`D1&Ft_ZjorMVvb@3yrv#i007+Smp17@ z&ouKf@<`5um_lOHVadGBb3CfWOZX(XqIU3)>I#uaOVnu&^LiE%D$L%NMc?Xv?rey0#Q4I zhm~lS)pToWAX`@&vUQ7+I+eB~8EQ)xy))xIhYa5qkDSkTp)tE{0#R3AZ5$+Ac@4Z}~7&p88HY*JZ|G)071d z_b!E{FOHS1CYeRrEeVar)$9A}w`_cbg5A`*;%Ju93DR2i%rm>nRmvok2(iQ6W~@c& zk&K&=?$;5B-MNpbXA43&LnXzSa!|q(NuoBhTUs6FijUQ-E|)155013f#y|6H7v6f7 zhUoH}2Ti}e4bjHyZh@R{tiR@IV7yWF4Sy0;=VaG~T!-fjo@5(~ z%#RvPCjc7nsLsDBkFUA*mRB%L4QoY;5>pNl=SjgGdY(l>HDfqRINMG6j3kHQd}LW% zhc&?xr>!YG^jA2v6r6jn@ zWS&`|N5eDXp^q>d$h3C0syvFvo;}yf1N5BH1><2i(y9C98^wnleD3gbGMdUFDS(xUOIkf+I(~8 z(vaEwgu|GVS1hc$sm19yHTEsu2gv=pF{sl}gRqF{izE&T;0OJrzUpyCPAq%n7h$@F zKwp$u9*3|@Mu5_*wty-1r9PtOi%*7867g0vIc8|z{x&|_V#4xdU+H?z{%5R;wxRi` zR1^r%QcPlx+PMToJKE}E%Lk$NVChoNFk5;`ouZu^L39&Ni|Sm-Q{;eTsV2RlE2XuV zFB8wluShctk8bt!lkPX>i>=480UemGt26U@>_KQS-dxwzyWVfTw_rZaI14c(ybnFv zr!?ttm356Cs}rc?lwMTKU5?gJZGSb{Tv#a*R9oZ*DwWWvQ)muxQN2~xBP7i!2;#Hf ztG`u_ywZq 
zMLiK|OLf9C$VK*=4Hf}CvX_eqL6D6)u(J{R15e5_S7w`5hHvfOrQ}YxB@X6fh&Z8} zNZX#lg^(lE*L-Q=w3SiWEmzB?yR)kO`EVm949fn3Wuh!tH_ldmL!t4baT~gL-KL#J z`Ug9TSp!nhf^ml3*KAU;*q$Ae@CkTD%9BCmTOP0-&CrJRn@Ym{D#wfzr-O@fgzNAD zr9;w-_NnxHm5xhD{)nKQ2Eh(F4bG%~!O+x~uArbdsf(FIgAJhb3IYMeBkH)97wg59 zZ?zm9Z>>PatNE6TJB1AeQB4i;{EAr;BH1t_rjP-WeJZu)qC1!RL2@&CFw5p{)RlZ* z55m3_i>^FC@yYDTMk#|EJoG(pg6oF{=?k=e`APR0o4J}Tazz2p+X3(kBrBTf;>BC8 zI*i4Bjb}aY*Od99Zu=H5oSIT2Z>b5$6 zQFN>)Fxpwz=$TZKR#%w|yu_3SywAF-p+#BjRjrt{724Q`_SkwBc&Nv@2srUu_mgyS zhSq9x6rK%Cew72w4RjK6e-T?917TYf%DxB1RW5wzQs`efHUP1yIAs-6?`^LXCtKGJ zgqFxXylJoz(=$G6tU|tvjampWYg_7AUCm<4Mi3pIV@~Th@*Q&6OPkFOa?>d6(MmJaqFWaWU(4U z0g4u@Q}XI5JhoaGd(ey&ifDJ&uLgFsZS}^=8E#B@bxZt0)ZnhxlJ3+8U}DBNzNoo#oj-m|Ay{ zaX4OM*ss}&j^4k(llrvv`C7)e-9pvY%7oalJr=O=5&u$4#L8{kTAefzcNn@L99m%Q zp?cMFcUVSPaGs!x5iTv?mCDGnWiov5_9L=7WgzNKzMMUbPHaU_&=%ex8Tl~FyT(vX zdJ{&RUKUN;KunVx!=oeCMFlQuDdnN$GxiZIU!@%tyQ=*C?#T-27uMzLZE7~B zs-~zoWAPVJ&`~Pg9~EWWL|#ujK-g)JSjJ`9ixV8r<8prt9Y_AZ*n7*UD8KJ-SW!f! z6zNm}rMpw41Oz08PKhChj-dq+rEAEcbB3;=K|pHg?(U&m`X2p$f4}>Gul2lmUOvxS z_j$ou%sSU~ookd#+gmM>}gm1p4xF@1>Y(zgSb)FN)MktwNif1UFVL zPKva)fbil0=dE&Pj(ABvbm5V3XQV5V$^l7D~6qT3YV%mN}l5)vn2bDCaeS3R8H>m_rQ`fM4c_0DVMmwYcFJ zP4t-+!ha>QzKV&@=`V0WU0IjIBuz+as(&+GO(m&0N_xL`QJfov6(iyVOBf|_bT5sT z!pLP`_g86DZ4^D<4K{IQA8@kkzJS)?CsG|P;lW!qu7UfW7}Y8^N0O3e!$+Sys!K^$ z*`HRpdi~t#z<2$WmO+p8fhxWOCB8jawgS&=>W@| zqn6tP?QcBmjcX@neQ}ohF2kL*C3EGtciM-)?y)wIZfUNh)tU)G`w+8M3HGWxK>9i%rFZAz6!9I&1P zW#Q0|b^mPrQX?+X8=(V9tO)D9DAX5ZBAS87V&fO9pGHe@mgCNo?aG*ke@_Pa=B$!l z^~ce4e4DYdy|x%g5q^;HAvI~lrnxFnJvK6SRKV7IVWHdnD0=Rs{&-HTOAg<6)|@8n zOnb4wNoN|-;v#?)0=pOw4{KAy39~llCj%oI*=H}@Q-y=YuYTKNpMo}$YNe%1)W$lq zF~SZlN#!l4H4q+?MS}K=`--yuoaIl%p3(GDdV;NzYT6fytJ9!OfSsd5py$mfB+&X7 zw=pekyy8a43k{E{quM?nWu%iq6;mWG^HVmr?rD{=JLlQ=%;j98x^+^wPC;GN>8zM%_&6YqXbY8Fk1Z_Y{&sPM!!a_=H-{}omx~$9Ou{B`mf_*9@Ekl30{n+!@h~BLsMQ=e%L7vlZo-~KjEW+RmKjM^7tG^d{h&@*;&{$w9F}KW zK+`1>L^MuHg6o-e<33Vv+`T1ecLJwlTZvIFvyO;ullB=On^mZh8bx*_DA3TVp%p 
ztJ&C?ihIqUYSc&`mGTP-LPrUTl3$g#enjMijCsCN+{b%%T}{ktigB^$l;U7lQ28?I zdfwIex9S`iI}6rA)0|hYjgNMpc)Hlo4tT3@JP|JdIXacn&M@3Wzz=_212WWHN(%Ki z*W>Eb3>Fw-t3+!?~eI|0{A!d$IgJ7*5?XuO5)0sjn-%_e1Ngm8@=nW zr2T3H^M|=g&RmD-6b~KV>!4knmaiJu|rW_7s zhJ{B$7as_Eo>E@2nNfg^@)i^n@%bmlx&3B^Vtu^>c3!;)t2CsD>S0S>yQDGb$(B4j zb9FNQQkl~dC*+S_cY=%bTqDGK*22-8kS<@N0Hy5u%jCX;B+2)6{*+To<0o1aPiDI; z`=j_o2bdiRrnVn+9<_ZCji;jB0EGQ0UU6oA%{w(^;}|f;C9t}?ey&=pvA-)`Dl@iN zkjU1EaV+XeXxQYvadsi;fheGzc3>R;T+jAazcVM)#A3VpBgEBaYu-Dev^+&wEG;IO z-Kr#M$C~earDiaqXs#V|SomYQ2BTny&)MPZZe6)q-dxfHVG`}ln!N@E$Ja+s#~`%| zmOhr2wISImUSs<#Iyo5zkUOiKB>#_y7411lY)4u}AcT$q@M&7;Y?{IM=7|oc!n+8hkrjx zhMt9;fTWZ08BV@#q({4h81Y4z{~Z!JH6?6*~WQd~>R-pGE{q>I~S z%aRKl77II_K>pTzQ7YSpRbrRTYZ6&;o4D#t;-w)EJbqv3<+yPSTQRc#O__v3RX`_g zir<>Sh^J(!iAd)7R&C-;)(OB@%uv7LCi#Xm&Dhjfy4Vkbe8l%seF_s#_MkX<9{a0# zCUf4k&Wg7MSo%qA!=|s$1V|!1wf;cV9&?Wlt<Al3Gep%OB5@$g-p345T< zJ$J8Z>3G03oN1u?(+#v))u;K8-lxVj7`L$aMz*4!Z+A<4qAMk39M5jvziyghH8%)S z)G=N)ztUM$1Q?C^K7>B>6+oiC8ETyE>S(vGW#^beU%ip#t@4Jk;Rev86~c-pn(#|h z^D6WVJj0}PVj)wCZc$e9tmAG4Ui$sJGj2R@r^4XT&_?Uo$r;}x^a)eaO`$409KkE7 zqdbv--`nl+keT&&%0q`i@iUYqr=(ZDBY+L7}u@tCa|4+lAbU4WAv)@ z(sg3iHudL3;ui}g#*(;H)@M09#Rl@mn&zq}JF3*mE)iNZ#b5T0Wf=?1%dtnT9~bIO zZ+1wLtHplSfzko~Y>)u|oeak(R^Ij+1=owYJwQK!))suDjf_8gz>=I+{cuhoKc z;wb9}#>#ac%$HZG;u}MBGX?!2dOE%9^6uDqs6L4EHx0UzIdYuS^ zXrQcs5UV0XJ_yE0AnOyb+EWNxMFv^o}j3plj(&9$L1r8a<8TdA_bbJ3Pk&&ndE4 zxgSW4roTC_OM@iT*?J5f2-k0*t0cYtnDA)vcSN%q7WA?NXd4Ym@Fw$12|f z`aU3I33)1}w(u<2ER7oy&B%?j!o!gCaEn|sV0CqFkfJC~uv zxBi0C{Nj=&Y|wo~eb~$bYYec_!RiLJ5u0DarAae;9#*o@f{z`jH&(jiFd-Z83bk$G zS?6gxaz8NjaArS(JT{@u?(*hbd{R^yQ<2!XmFM2uHM=6xdV+iJDE_GIlrZ2n30h?@ zLEvKktC_ekR@{Rk{D_4^gKxjTXsmpgv~xl69X(jalPY&JSe@z#Ow^z1l@iyDf96W{ zw7RLeRQ>JPV5SM^Ca)(heX5B*(vkGAu5G50-{&Msb7`H@{(r!OX=YPfjJ%foQ$#7;S$ z75~JUw*ZtRG)6>KJ=hv8%`DY(kt<;^qZk8EYEw7}=y}F}GLNH;{?Og7G&URDc9_O# zX=DoH5h?ao4N!#~TA#ANm#O#is@e2|!`NIBOT?O7YCBh9{XH>k&O26i#71UHt$c%5 zL<-tvcHP$fJqXHu_~Fl3h+GI&N*rilFYbQl$sxdgd-wNrU@? 
ztkxXgJkH(jWXdg_3WPjusB=1NbN{Z?p$CZ3qzUV6Z6nvFxd^XUM0$pyeCE!!jby95A`+yma=p_mg z?*6h%{wWpwrSTv?S=Rvrx{K8zv1%`K#QNb^ZOL4{eu(7W{y_y6Y=6x`3a@;Lf$5+*~C-rpGQa|7P0f-QTp(lRsHV z=>-SCd{CIf>cq6zyC13d$ienUgDMP5zya2T|n_eZ0~Y1y|@Cj4xgJ7ix5X2*U>HF4=&?fJSPtq z{qrE8&BsiQf+qQnmi68t()WSFztqRbSEhI4_9dK&seLhiFo8y0Wjk_dXYw8JbzJLj zn0tIZ^+$zw4C3>Op9j~WWD-F{mJ~6YyRK6zURA4u>-X?=9cc!8$NTQu8U^DOmWg$o zQDNM3vACjx4iP!J_k=lwWt*-l{O1>(2GOMiFC^qVht(T~;Cik%CHk?k1x>MX8Z0OJ z`{8q8J4Ei+4sgR=U+=6OeFIH>iQ){AZHKVx4B606bKlvEKv(-Wo{LXsNxTZC$0+%C zS%@O;2!5bY^^6lm*yv(m!`-#?Yc9~bBLalDIJ@pxo~{W~LHWTpIf_Y79v&7{pMJQy zb^h^U6^KD8=YUO1j%;G13)9EcAl_A4Ce!xt@e%;i!#*WOtDc^w3V+ShiJ+*`SMPK7 zZI9N3(o0G{!SH&hADNat?}r1TcXu-;X}BLzg3{D582X&%KYCi05au4!Tg7Bszx98L zz7|k)*;oeI;z((Z>r)ghEK#m~+O6V9(S?*aoyuz=YN_OJ-^qXUDo)@LBO&$ zdha*bMiw#D8XOuoAcrWvVh)yCT|o|mpl4OBKi^eVGs9i> z*vu@=%@wE~g;VcJWQH#)yS~|Yr?b>9Y^X4tpk+PCcuRNmOzuouTN;*7#K+dZTeHY_ zi^wWbar5a~TK#@>s_(<>q$c;Rt=&l`nL^Eyv(g1%hO*`*NYp^U*?{8Ak%&88*_r0g zqN>%DOY`0HlMve$M20QHt+o9qB9eZXOk7xm?hMmv0v*l z*2%xe<~55XpMGSgimM8!<25&|##cQp5?FDU)>qDXhEjOp3Kyb=?z353rA)hWJD%yu z%@@DqjUUaJx6TeZ_u_+p|1kivKBk=4vUY@P28-GG)Xg)wi#uz}bUL~`3IJd!=N9m; z7VQ)zpO~6@Fc#lz!(U|EZ)=Fsrgw#R{#q@~u6kfVmNvU3Ua}Mi#SX>{> z;Q0t@uQG6>)HPidkoCTp^Z1Zyy=IpAJ8ac`!0>k*8AX+Dik^$QBJT(vg6$PtM)0!9 zrBWuj`7`esBfMf^QOHv71Vm6BBXmVSGDcak*SeP=Kv4bB&}C6bfS5={|nJ2zK zN$Ge=cZx8sqg-KUVLDydl{0$UCfM<5-JZGMZeJ&5*RV9U%K17I)i3Cl=X@&O#tSds z(||G1*E{VwC)Hp|#auNIIVh0Wu@8Fm)DvY|7+Q(b*_PDE1Pc{Dn6cn_`%=EDye~~x zGm#1%TP97a%=zdxL!_ujQ6vG^a2ttS)WD!|eITzUxj;HV!|O;WIwf#*$zqg;jxYAk z%BNgqJALxpy1}Hb@75QY>2lMrbFm>VUMxEQ5C)rkI}XmOawfA16YG)i* z-<=+zT6;$;`t}s~=0!^;8&Nl*1thFjsr*5$u2QmFPX4nlwziP6l_vQ&yJ9Pq=!0%f7e^&VyYufq=7Q?3zro?M z{`jM!!9fpTB> zOl@+5V*{{moIAT;P)e$Gdp}JD+Gri)+#e_^6Bp{imPN6y#7!nByPg;$CcsA@3i#vpO+fbrkGghalm?@0E1!PX z$ibc5P;&i@AGWfpvI1lU?w37vr=1L{483zzFSLqeQrW!3Ls(nV7n~0}ePvU0&Ao<8 zsrD)*$|b*!>rWCH1Db$jvHFmquV5t|KjswRJC6sLL3X8Ksr?ZQvRTO@`kp#zo$g125C`wBc z-+1~VGnNpmP~y$bf&HR#0#T|Q)tmWH7f%y_9F;z~ZZlx-ERmxxKoqGl^7-iuqJrEo 
zsNJdB`sl^8J-RRO0j)TQ@<@kq(XV7Ql<+xp_A!=uj5joYpig@%n@lzSijUjB34>p$ zX9cho5v||DOY+1ic}KS3{A`at&4bbs>Dc)MjvUjHU=S(!+6NcrjHq%HCOYfzEk zsUqX&79gS!-b{NA^WS{^gDeR6?jY>IrN{8(qnIOw2Qt9<=gsMA#H zgTI*e!N26I)16ELqRhyw!|F*Ci@$nG_u`X3F7nIsb$+?AP`z>LYZlO}E2eVQ%PkaV z37Fd-e{IKQM9{rHV;6aH=T+!)o-{mLImemIH>{>-1#)1tJZ&{}CYKL3*XHP4fv0o+dF894+J78@99z^J%V%&g=1q`P}2(NA)iDzuFb;d~HpC@*7HIO8$L ze=$IRJnvFjt3q+Pjvo2|MR>G;rIjprYf>sO@AJ4bq?PZltc8m`61>24Qsb zs-I4^?7hBJg~rsX4h}c1+XD+#bC?-==|jMXGKm=!<^<@Q)gX0;Fk+lemAl+^@M8k+ zu0u6$FR@Rdync+Ln8Jjmfda`$KV1n!S9-}$@|a5_nAo%K*f(+TxO}v^8)4-m6^+bs zsL`Zmo4;~p7MEls_OFGjuL0y_{NcHg!L3d|Zj~<=ijsCIiuK(uw-iC}>L@v<1Zxt} zU8O!1--+c!@-5*P)5k<1+ciozgjR+>5~>u1T8o(%*!MR3bBY%1^>HwxP##i^Er)a- zAp~4geM^Gj!*7M&SdG;lm!m=qg3x7Qa_bi_L zH7VL(Jbas0!gw?Ar3P}J8UuTbigEto7bBm8a7k$Z8o?exTei@iHJ zLyXR9m;Kop)bHzXA{kG-_mWf_lP;}KiR2|!kK>ulMLmq0-b?q3;b!=QJob31n?Xfu zTbgq#!WT`L_1)%PbNo&-)nPcrW>F~%>sqx?8pHs8KndWHr%qp|TZN*@2>xOU*>2y<*{-Z`2mFtvf8fa#!GkiDP7{+zaZZAIW$f zF@5u}D?F5`(EoP$@J*ePfK}LrmwPO{`$=-Z27fNW)&x{ycL1W|aOD(Svi?vf@-wSm z!8x9fN=+JEJB;MZ#rD~{9-ACqY395wN4&Num1iNZwG#JpDRbg9v3CG0fM_Pu^J8L? 
z@ZuK<_O-oN=-8=2b`G-)wvNh8Z3pZAKxcE+4^|gUHsMBJ+0FM7D{Ehh*0DmE`)MBA z!Uh;=J6&W$Lh!tvEy|=SP%)x@Xq(YvX)5KU0*yqX8Tm(!_jVhEsv3ZWBH}t-&@-=i zi+FRnSmMfq@*)A^h(^K!dqbU?fTrT&W9wN<(EVh*I6|OwpVcf@-+?~gS}fI^1OV5O zm#@O*c)@E^kF%Dr;_ZV0{{k0njjR!(cC7J>zLx{J;%?(R_zCl^OJjl7`&7H{$D@JI zMZ8syh1#4Bj?{eX1ldRV^Xb;gG~I7!Je6tY3X2^^Ngi0mm@Eu+c~<2*;;zJO_v2*o z=`V)LMtYyRBV2{3tt^&EI-&DPkdO;U0!XBqMl{jZWNykp)upErz+v0~UdO9JyuNy2 zWT#}=opHXo4P}$jK!bpog7G*P%DdL0WB^X6`pzJlx0E9KI!9$r*vup0C0ZwH!3q_d zwr}4)5FCINAj2<+J64@}`78!D#KMYY?;5>$*tERmEAl!smJ8?|uvtTEBop{aGp{n| zkticmlDg(qu2W3jN#zxcb{}BY0)ybBlCpK4hK+PR{@+8LpTU0$V{T3~6&D&%Q1LJ2 za7G#X(o2ixI-fH{usSays-2tT`_EK4i~w)?lG<@T?KfntM$SL7I$G(md7n7MJ3nPe zTte=QiROWpTaf|Vtxt-VKpV~zT8ywS<>UosfcWIq44%YT@#J)YO< zwLUs0V#VpszOIw%mC?}f;<5ivCqFcM4VtRN&%hK4xJRLGLjYooWu78aln1w*(f7QV z{IaNzF>kT|^6ozu+!OuFqwi<%tINSSo0S4>BZ*Ae)CDR3?#TX1XFg3@_Oq?U>u6cv zNlf-KnBL~hni4?$pNrEY{$g9x`ImHxKKA@L`V^)BP-7B|Cakce<)$|yj7Ls7a#!cC zw0kaqjDP>3Q2E>Z=JEa*xO+fTo&YkXuD_b=-n*C|w)RIC@a4irnuj%lPg%%2Qa=KT@CbdIAh#zF}dhCVziPANk|Qc<_h4X|!2<9#Hx({|)QaR`D{dC7?5u zS`J;k6a7vVe>(=24LN-q5IbI+_;wX+_~H5AclM(`-@CiE4j2v<5AOX_5a1ojpM29t z&qkiN<@@^n-l7)%I|KL92k_nY)$YQ{!q7g(Ki&Ss8y`Ijx$DfkZFIkQ`k!tQu@@0- zHSrz)jAoDs9ftC||LdX(HrkA?4}kJ12L4N_lazJa^X{f|iR?iDnVRiC!*RZk280>i zD=Q-h@?V&t#G!`r%>NGb9rvg&S^k|suYD<*IoDUhKQH$0Scwr5V{8^yizSR6tm%D#_vy&2tJ15`QcGPj~Y0G^RLj&21xV zQ7X~Dgka8`e+Kds?3RIbYNu_+dzz;F&u#Acx$J$u@a>N!r4X&=>WcVh9PxiP$|K5x zAM6+GPz6kMXzv<%H7j%Z-F{xlx*xQ^ruT`&FJ%pcB{2k`?9+MH3Z&@}jWkl3a> zhvD0kmufaO8Qm`H(f3j17LS;3x|_Khn4kM84OqK-Kr>8`#z{jyjO1hu;%Tp> zRPA|X-CjsJtkLC4G{aXmjWNSIdIOw9JVNLi)zeSE_{|(3{{RtMHu_GDjMan%QZ}04 z_1Np7+LG_y&AjJ&Zwl5UDez^3pe-*@w4dl%VzK~eJcSLsoiz*plxjU+ zO#1vZ=zgPC=04TH0&l{4cQM7ql`ziE9rL4)o<^e(9_3-{-`<=Z%v#H+E?=o&=67>1 zH_)pO^(H?zS4IxwUZs4!ysyVy*1$Lk(Ma2uo06@;4DU|77OEL6EUfCR#J}%6_!$Z> zj9!lAnXA2$<;!!*3$KnK7qdlvWlIqr;_B197=m*H;SenQr7G)*Tx+1^z~ScMNS<;U z`ud`A`uK;ZH4TkN_wK#qAVq^*BPm6$x2I2&CqR?s=Xnp1EPtP2u~odk;I_OE3aJ5q z2s&#)7MkO=sgsa>gWZ!tU*+fVlY{7ze|9*lbj04A 
zhbw+m-kgxL#j7ke%5N|>E*!P{-#I>F-aGX({*Q+DBgXhYZd)UDLGyP2|NdEg-HLwD z|7biI=)3%n|KHL6Y(xV;|7kkkQC5^&wfX-3*~y2c4;`uB5oBA_PM{N|ei|qIJstk1 zC;5H0%fGa2|M?;8UK`rW_rKfV<3A()AMO8k_(vH3R}TL_lY_m))JR{3M6X{WSjmjo zkEB1da04ilbSalZ&%o0w;WcR4yd~BgO5#LkNjOR8ZfdaeM-0M2g6!sPs%cTGx zPT7km7o2*803tkah|um^a{JygdJ8eZl~OTr?xX&$P3wM@d0G&)sDDy@Y6i6#XLBr+ntp{R5@NW$O)m!e+S0 zKW56`#v_8(D|$Qt>4Bi3oDG6w=2&nkC%ah|bg)RR& zCd!5BtzoC|r+N<*mnH#)p#>j9Do%1@SxTK?OlL&Q;JW6E4NC~GXW2|jFqg)t&r%ya z3_DAvw_gRT+WAIt-NF1=AVvbbQ`tmmjpi+X>vR$@c~s1Cs=3bHUj-Y=on#4B$J{Fo zKmDNQ!>1f?*C6Agqp{xgqqck^aR9 zt;~}iD)^pWr)j`j>l|N&scLt0N{dPcNP)GjZ^E7diUv~a!=mGQi7YuK+qdZ+ZOeA# zXzjL_XoX%z*5Teii z0aRL>y)T?EDDYADBPpo?`M&AR!khJXz!Yg_;(m1v99-^!k9r`NR6y!UG1BsigZ6YQ zG0IKmB<7BMWPniccq_CaMXbcs+*8hg&)FQ0O5aUK4LFDzrcqpY(=bv&bXKLkbqthU zAcOg&rVqA6eeubBDm#DlEAhH9@SAl1hu*Gg%^ERv>G_qDx{NYrh_)an$}n0{Ywt8P z@u{qj%yMYg_xs4tC$OR`?|+#!(e0EsLiyubUv~ai*XxeM%i_G{w-Ydy1K( z`drY=6&3utBvI?6pev2LH*Wf~^Y46 z{#$wb)Os8jU*rHA(cm9)E!xx4G6@)hA(U-1p|g79OUxcr~JD77i-r3L4ePe$Y+Ej+&K^ z^ebo{z@qX3^+YJn*Z-)CXoBpF|zx#FK?r4F3XDM;-22J3!hMylz=Pd$4HMIRpo_VxBadxG{%BkIn4l6B7=9 zHUmGI&YRtOn@dQOXV%Ejci8CRtB_QrY5-?(D(4y?xxsEfie`hHxnfzy_~-NJzFkir_?oMlT*H9iHs5ZK+_UY&Jt3d&VFgisXPW9 z(k@48Jy%NJsnSYZ)&#RE1|JPXd&Lt=&XyBQ&!4K+Ig2zZ>7BhY(^$e-i`deqP%1ca z5sM^D+pbUdSYe7&O?B5y$YxmvlKJW-|M33GoinBE(I@Uby8OAika-|XGj+JVRZ{leYIWC}%VJaVix1 zZk}0uqpeDW9x*FUaHKL@X?6ps_jcsH&Ire_&1NC<)i|myaIDD+pefl@d_~E|Gp5VI3L{VyoELSSyC`cloy_ft$iVdoF1Saytgwqf~ukrUp}Q z_7UDLmkEWfy|7~1yZ)-JMBVHcVwBb|Mvij?L0|$(>Z^qPSFg-Vc)fEJkt258Bu%uw zaT%O)oNapG=}t=VF#yvjdU+7D@~-!)D>Y?Qqd{mXjNl4h%NoQ>uI2S{d@f~D8nYd; zPbxTFd!bJrn6H=pI@vigBVNPso>2`7d@G-(n5Mq?fzQ)hzbRCmjte`!om9&k=X=3{ zZaK0%&!fj?A|xnpGWf+eB=>e}?Q0#ux!tnL9HMUSg(&Xev9*>A5N{w+#JO!^%!Oxu zxtR6$>!60xgjaid>|*G&b_1vmVpPV&(R``0^{eMFjJiybm>6zynr=~jt@T-+t^7^c zNw0=%e`uG*(nHxCt29>TjHmBUI1{B4Ij`S`ytUCT(eBY|HCFUeoQ@XKSHFH;QShst zWb1N0RKZ*KN1cYRCPiv3yKruVi1Jcb(y+3n_-RMJ9C5`J#esJ5-(Ko#p=dAS;_s(swcOVoGCW~LW7`cnCjQW-@ 
zJL0&zMx93(8{a^kRZNk#T?Y3!3pG&24*Q5)^F4Rht}XWh@0mPW8C~to97b~Ubmp+I z%ZX}Hq@qco>LyixXXv$^sj4%X!1B+4%*2~3Ef&N_Q{2EPy z(tTWw8$aIM_eSX}WUaoIlJOae0dj>{E(HT&c?2!z;K zYcw|$vOzU+V7DMt3U{zaHGxaKc`Ie#+oz@p@N)YvT%V0~7DZ#2#T7n<<9Xsx zKRkl(Nj5EP9B3gQ4QMKi;61yQXhin|g!2&it7H<9|hl;qZjY93y|Oqk-_k-$h7& zcD7T|pyVAf7t^aZzSh#xAay}G9Qr7EAPH&uCJC0N00 z)bkBiM2KQG?e7b)wCCDa5;1X?u_s$dQ(O~W^I*sBUk9-y1g*j<)A}DqXXzc}J;3Jk zKJ-%-=-tg6-6t9=3-wWL(0Nv|lcYD7k}N^vWbrxEe(rsgu_~qahYj(<${Up5M4=$D zBJPtk(vn$ED{$&5M~=uzn0$|0{Y;$6+N_pZzOlWHRa2XC`=aCL_W=)g>(N#c{dyNk zXAG#7&S1LuR_jaecZz)0p%z+`Wk`!T@Q|%6E-=|7RwWXOU$i%-ysVeTYRILWmrT7@ z%lpGp!Z%rr2U+DYRb$E%=qY9 z&QyT|3W^7gSBb}}#?0#BX4_e`qfTPv9mLj5^w(yN7Mu6vS7U#Zo*o{;kU?NMj_QL=E^=oIdp{lv! z53*HqQkT6h`A`&)=dJl}zGfr2g`D~pP}Nt6+TYIBDf53#RJZG*qp zBFhxWnua2%)^F%&2|nX!vJY6y%-C-@jw#Qa$+uRQzW@d8r_{$jk2=t=rJnJZ1^wXq zgsv|m3SE^37|)Aa#4`2oto|EDH%p6?4T9n*C{V^g*pGAgE7o z%MNh!vI0yQ*j=^W4?Tg5bL|6$Es5YiJWZz>iXCA`%`|-HHWRp2`5Pi@YlUvUcUMJI zNK3CLaSmeNGOxKv$AmxHGBa=ZB9O+0l8lXl4RHiTcurH48(@E8(Zpp^b$;}8UcKoqxFljFi6(vZwOQZ0*bN3vsS~govnD1d`&>?M0pOx| zmT-y7Zy2)1XKgUwxgn}%03|93U)nq|*e88E`?EXd_o2RSr<>xTTC#jRqoi{Z1xu}g zYdA0$b8wv_IcAJlTB~pz@twIxqo3|*cGjhlA28?C!7I)&&1G1P+GiJMwq;ZSu~xi7 z4OQyZ56tAFX_}7Jqxm#Idsm%nmnfP09A<%u*vq}NFl;z~c$h*zBW&eXxdb{d-5a1wJ`x66eV-Mf)*6|i)8XpM;DoPgfFy2&D#XJ-V9%COQ%J8b|TIirD=ua?wCxGG|aQucqn zEjeQ#5It$?Nn!l8&5A^|bEwp=#IuR_px_zT!M^isF{CdWU%i(b3lZb_D8~y6}8d0^P}CbntBmBcPUM8p0YrMdH-HIBP?Hw+3lBGye1R^yyu=jUBkg8 z^4itsn+vUrV9Hq)nGgZ zWD!gc09)*)7IzkS^x!VY}ZIIDZ2y(%M&W{QSIP1ru|;`{M(IH4mM1~nB1P&0aY zxXM`lADzF@MtJ8TF_jUmw**y}^|2}(YOtrp%Rs+uB?02(?S>Al0LI_ZD<`KNODH~+ zKjDI1YzHiAcUW)W;P%cLt9xUzC#vnTHT=hGRr!=O*I#I4e&EjEp-@RVuQ4S<4l}tT zCN~M^6>e>oS{0vgU`M`lT#huR0!;`PYWp`95=n8kO514AxYk$gQs1T`in*<*7Q@xd z%3N&*N2kJ>x3ly<*>UCds;(%F6IKGm70`4HFD$V7CS^bKHl&AC)PKP`pxkykiZwMt zL9+%FDuxyiz1_$9)as*0$Fyb|h6`naYC*M8A^ zYKsp?f}<$SH?y*xZT0Ev;!tj_XlDNcLl)d|d8py3+ejPfsg}-OF%oSfdXW9htc}QB zd4_gBRto%a%mqH;ydgMd|0Vr6DK89+G=PY`smP~uQhDDBU9AMrn?V>=PHRIL^mrHs 
z&;lA%N4tqIviWf|R1x}SBsuuol|_K?rDR}xBKuG3aBq`uj*)IkUWG!0&mT+rp0X6Y z_H4t`a~`39jCyoRljFn&;)KcRswHGYU80OxpPFKCdukzA$ejOmJEriqWX3SH2&%r?tEHw@{VHqG=yWbquSL%|>Z)_D5`GXqR0E0C+~FLC@R0$Q0!vnV^#y;x?w5 z2{}D&3nlSRfVb%W@BsLh_@A1@IEIYsdNC%W<&#biv-jFQhJ%XYOf<`eosx)w&(GoZ zY>*H!FOfOW)U&Ft!m|MiDkL{rFg!%)mGh1)h$)YE)1aqpd}ZUzMKi}0A}Lg*sY(uv zz1z-16?)LsIeC`Ue}ae=H<$3sFw_&f2;-qK5y$j&BJTxTw|ko=W!;4n-{fc6nZ3oB zR^Y^o0H`u^J9#h6_9Ui7DKXyI9NWwDk~SJAMAsR$Dh?qR+l9e_xLQZmYuO{4`nx10 z*qqAhpyAbPS%Z`g7!#%IX(+LY#iAx{ENLc<#%(?bEaPf2*T5>!Dw0yZ5Lergi%$LP zj;Fy5*~M*_&fPWudY zA`c0jzn1W2Qr25FWXH^(BnXQZsOOfRTbd`}Cutrq?!!bMTVIOW{KqOZYzevg(X!>2z?mDwc2$qf@f?&VX?nXl<%!Vkp}Tq*T^v+7n6$U&@avI60Dh|kK{~4%KWmU zaSN|ZSWyfPR>aM(J8ug&?+?M8HJwV72y@txpN4lzMh|0uMILf`UFG^tYE9T-^C_#a ziCgrPn->Al>izc9Zu5KAB)h}|e0Ja?!Ff3)bs4u-lg?oF7nGa^H@_~NHZE%vVoapp zW6V)10FOLn;;U9M!glo|H8T6h7OJ^U4V;&LKjB4x)e44jO3)P|&`l8oZN?gjvK)p$ zOv(`(tS{C#1lu2+1=(sVOOs=D=Jr=#kSD`vy?;+K$WIaHneGtYX$5+>@8l0Tg~SG( zno^SGQAQi2MwfT&S|FTuTc4oa(rKM{Z7amTn74~V!=O>dFO6vSHE|9ZN^3b7yxq;tbmHVMVmx}H}Sr*N%mFFb?7=Yyp6wm@3`-c zcm8DT!QN}Fx#s-N&zx&-jkdMS);VRoP2k0m{9gL6jC#yGdG$${V~T@c^H7%IFs?zb zr6Skn6?)LA$Yb-9(AN8dgF?M};MGMN`IMxYQKn`c%4F((2mOYek&zo+{f5-$aq<5B1Sl7-p1pe2g%KGoxR2jx zTR0;$Xi1et5gQ$VW)FMSa^4DPxSvrU85!+YW$z|5 z-pDx=&PLGch>5Gzig~rpfqX}^dl#JF`X3*E**o9rkU9?M8qK1>&{fPWTv1Nom%ALd zJpW^-7E0hP5vQ0|w=rCt-P@h`VX(1ml(zSroq!pmTe>1j9G6s!h3$JI2R|MZqi+AW z_{rsX2P6Eu!QCtSd5SBetq!f9HCArv(+|HQ=A(@joF66(y(6P=)?}^i&pG0;jZl95 zt&XC$CjaBffSB44k|R3h_Gwm*$qQ9*(SZWVzRO~03%3RvtMc0p1>RFm(&e1o%6yak z?Z`^M_J9*r_?1e1ZgzvI8e8heD2dPf%ld)tklZNrhg7hU%u1WME&oPuzfZJLUi6#{ zmhzF@no0a>$H5QfwT@4pe$?=9#_brVU`26R&t!!6AZ8i(%h$$>zQ9zilD#yc;TMV; zN1;b1>+n~^;c=EUi9@0$@CoVh%rCyqYj)1Z>;VPQJmj&t!KPGV&4mktA|MK%A3A&g4?lX{X}UijW5?@W%y0e6Pa zay}`$&MB2LboJv&`Stq^!OFr{Mgm1_cO%@1NAg2avFco7)3FIhjzdj*9v-i8HnQvb z@K!I~$H)i9@MAWkqZ&k10Dg5VIM|SDq}%X(BimJG2F!KHJetEcy>%)#u>z_9*BIpp znJYDW&pe5_{21?de*fI~v4Su~EU?=n8>Z01be0h0hjF|40jt{%GRTo475sZXkU7!C92}Q=0~~GdG(JE*lzZ8pecEoN=9B 
zw5qv%lAZ%HbMk4MilS6k$Qa$JCo=2Z^iRv0k-$@OsJ@Qojq(7E!QeYjvmhOfDSN(G z!!6;GC|?X3FGhgqi;kUlxO9`gJ~|J)+M}iFT5FWoeaEeVT3hX>`(8oR=G#jzGryf^ zJR>tzq}2NIOBCDMkCN0?wY3K~7#n^eEFyi+=9D8r=#2-uy-yaDz20*+uI~&-J*R}~ z_l7`@FF1^@FJWRQB|>0f9%(-M@Y_l_0wq3my0juYT+d&VC zJ_zSGm+-<^<~)-YlNm+2hIpSK`Fz?c0k0ji?jUp&oA?KSS2K%nE_OLv7#&Awx+y}t z>>9nSav^@BOGPZwlU}=dOW{VYRUS0(|0-!5U3m6hEA^KCE?8R+*)%Z;O{FBQu65P@ zK6UWHCpDq*c)rWSvnVm5&~d-&YVom(G3-@WG3axWg_AJ&0}^Jr&B+#PQHu>0%bG`;SQ?Y*IIHS{c*S!Iw3?Q`Bn7jO>G7f>I0b zUzRlY2nlQ4Eg0Db-!5W?c-*fSd4LZ_`ER=%;6uDc_g)lg#&T=kVIpA{N=c822zSjA zm$WF8VIO@qOCQ@ALa+r%wCW7h@bQ>v$_8g5Rd@14@O^V{a770sJwLm3=#caaCE&4) zP|U(Lqo}ugM9F2f4iU~zFPWqLPPZG$@oq7BD4BY6Hl3~sv=+I;JV&D{3+GqGxD<wB(S0VALvkeh6Pe|3~Nz+tg;x8OrgB3;VUy=!YZSK*+N9s%-M zu4pMbcftB0aO^$7&1_5s zZ9v}xRfI8%PR)N!V1wA}|FH<~nGy?R&pvE;D3_`e@ZEgo=_wmCw~aHGmp(*Qa-ZMB zs-VkwG}S?1=4HRRF^hR^cz&C6%BbX##;~(L?{c5{8u5Debkj41K=(Fyt~R}zXxl*F z;{m%cMvqA=VNM1__kt1@X9kB;wW$~Fu7_1fnZxWRuZQCVEw zRXw)CRkA%cVfjFd9pc?YR9WdAUH|nr`J{P7yZunzc^0UBOwSs9gC0AY{bWAu2S(-- z&bxNQsmr$blhzKUTgCbsnP;(Czqmj2h;p8i@G`KwCgUj7COgTFBa=6#5Rt{NUk|h& zm)jX#oy>P(cINfOf6R`iBBo;LTU#?oi z)OU)-34PbO#FgL=EK5ll#_=Lv(Fg<7v%IUI%@pEE5tXE?teMrraeEvIk?k%zqlSn6 z?D?1-vXPPhR2imh#?#;XHG4hOX7jF;$Pm-WrsIcZou(21I~#btP(MLxwzg7u-hA}_ zb=Co~vEqxZi(UHQ*boNxfiHIChWV%OqQ4j7P^()`13U0BdKy->&xtWqluqGl{Nu5 zUZ2lIiW?L|{?!WcIe!jHw?3dx%9#|@lov5n}MBXmmg{N{X(|*R<`kJ zNf#yG)2s1ft(d`rIStU_i$oh(c)^>Pt$J&cy?s>4EiPHr6#SgRegPZZ5^`7ATYGum z3$0+cm68v2SfCw)nAs`}6(ZUK`n;em8{C!q%r}u3!d$&HN$DG`%(s}JI)8fy==$rS z*K1zirKm5UC)u0|tgO}-zv+mpg-&n!0OGIN_UQby+0wm$oJMR+1waBu4dWcG?9;&Q2ETvNI*B3A!V^Xk0$U23udL&UWTZ?ZH)@0~BcfyF=>Uq}rS{ zxfId~J|-)_8o|xU;{12s=_jLO_-j&bM5L1A%<=K}));VwqvaRb*o7nG-~T2_xfTnE zGe5%PaiGHyzy?vO>y4Rv^}DG-!AJG>kG)UGrG!SzU3v70Z`>4_zQv5Ai1^!wwvC^D z=fNSsP)>@a4Lys4v7D@^|J&e_cr4P|<6zFx4}hH1o05@my7@T81>@1t00Iz4<4t4ezC1CV2)% zn|vn3Y@vlcr>?Y|>YIwsu7fx~MqYkJP3tvd*G!B(&)+>CJ84W^+L+eK&7$)ok5*~> z_I^(q{y`6V^kv~qh2j9ijmTk6?d8pQn?b~m+NZ(7_|DSp*pt0;il@1Dd*#Z~qE2yB z9G^c1a%eZkrC8atCHGZL6|JIesWrR}F 
zE_Du$xfs0j7c15hbDd}CE}1;-w$x>(SJd+O&I8&@sy^=+eQ-0F zuZTe^M#0D9R9hrGSYpjWKswG!Q$o>vv8v!xn*UuxHppiCp>3sU(J}C z-mRiem%jRovxFunqVaD=n9Tcq=FQFo39KImAUO4d>yUC&Ex~cNb;c=ki>7E1i7PhVs)ds(dsl-8zpBYZ#L|qtc%2?r@(=KszoTJxN z0bqJQL_j45cyPlV*1q*r!$6=9RZ_$P+^k=U8ypLHg zY#p_$!T$tzEA~g#f{&x^kZ)uyQd~$|e~F)?{+8TocZ_0}qNXA4(Rdg-(PJ7PtD~u* z78gyy^?B#wy|W>07;k5>>Cp=MKg8S3<`R@YfKvBBmbWD{ryG`h#H6#UXyf&sb?QJD zRhwQgCDkom0A4486E(@DwP6NWF$y6u(nCyjH4NumeKD|5f@7$|-6qVlwH>N!= zTGLP3$1miQlk2^$E}W0a(|tZ1PgwDezO+FVr668ay^k`7sc!OR8JbmusW<`^DlzKHTMXeW@9`K?NVQd&|U|E5S9(8-)PWmot;*$O}|BOU1_})(h`z3p^68t8Z}Qhiql0P60L8dB##F7LOfqIMVdVk<=FZl z_LQn~rA*q~$hi10(VhR4Ym_lN)rv|!&wsd!Bt5iXhI8tfgYJMIn^My=@_;;zL^IZt zKQ-EwIt zY^t>TH~(twN(@936H?(+4>uVPO%0skR7|Iji!l!%Mq6plH&=v*U9-$ihO{E7Q;J)f z#}>W(2zP$`O#QajpIgxmch726hxTk;1Vt@vCg*ePqbfTU?F$9OA}%FjF7We{CdV~M zoi@I!g%!2mAR|PS%ElNsZlc+vf;rC*{hZLZ>up*xGb5r{`u`uoC>v!?va(5HR6C4t zu#1_(&mR;hVjmcsP8X^lrtQrUf#x1Z*|B*rj&Wt?I~lhp=vUk(M{I^Ha*G5j4|-=A zyI~Whl$r<;;NFCV$66QIPaqGKZB>L7UZIJ=eG)^oQu@ZVlKoDx9@Rz>a0CEYQXDex zjPH+=!QvEqUzz73bzCE;W1vFd;R;-X1P!3|D&y?^>b}US=am8y;`E%3IakTD{bmXG zNGvIf7Z?o)R}E{P`@hj)bRv8_*48k|F4W6ax|gixJJZ&zAO0;C`OhxaK=7mvDLMa} zHae=+lK;gGt(rN*Ym134mEGv5)$;5FbP57K3`ncl&74Np;mf;weY&1?!?bvBPX*^& z*$K_Ko7F=x@sjSflkSX{`>un_K21^cSy*c`dAgf1Ppu}l$gWuK7gdh&TKD>RK(tb} zKc0xN#sVgP*_yGjB|cxq?bNUIKMFZpo-&8azOf8Tm?Po&huq%Adutj__wqw((`x;^ z^G~ZaH{1@*Vmfdv(@AB{H4V4L)M(K`?H3nW69G=Kcb~It#S;F zl>fnbY!E%~H?)58{bXboYM(*?>q?s_KnAL48TIv>^K}UoCKiK9c~|>23HkX6y}h@$ zRn7|!+x=blgQKHKV>#I-Gn|%Rs)hcY->`$dbImHJXBZ>zN*Gw{-xTjnx9+LU&h8zR z)hp+mil|eF&#qEv91Gpz?83m)W-sKEm;0;{6~ReAI1WkX8U3(E6;sG*M5wy*#pwrG35dj7oA>9z`YCj6VjCa)B}w~HFTw%N@t z$iDa*W;KSI=!HbXdExRagpeYtu!xXufvj@(gH8-8IabG4oVUVLzCHG!iFgafE4 zI|jt=C*C3|gM{J_4?7Z(q9cT*e(0&cca)fQzpr1nHRr9jspYJw%?wm6HGmYe4~?qV186|m~GkB^5~ z#JI!_K{fJ$F=~h*Fq)zLp9+K5Z{rI4qR+!4KMW2zI$0p zRL_~VUtxhJ##LrxJL|%#8#rVkn)=|$bD=`&?d7g?->kA$8VBG?F$expWD&xSHqq|Y`F=pD{(R?HkN26m5FE&_CxTf~rhTK#J 
z?AxCXllZS%=62F6G)77aTl}&eQN<^b;OBkJ)~UIRv~XBJp85i#Mn35%D4wd1gm@rFtS~QxEXpt8%I^~f!?F(P3E|*HxFlbjwP?;YdSmRkZdq?kW{2#27 zGBvtnn39sDp|V*@m+1ZsweN6X`Ht$ne{Kf~LZ?n zGfJ;qL(o|E)2NO%0DX*(F_G+X4bk#jR%8Ii5k6~vfy@+~<=t3U^`Rc?81*4xE)OK7 zlgO*AZ5MTPY@rrkkbK0BoPJ0-o?kS08=fL4zcF=*F(9Rp{>Lx1G-wk{ zOqdKM&8$ohrI3m&CnP@CZo^BIK@5Y6JKl3Vmi82F#`Drr{BIlLslBhS%1nq^?-ih0 z^gmG`BuJWQWvd`TMV&^y%>A7$NY9;RV?of%SY)JJLK_Z-^@D(e`8tEHu!S$3EwbT@bIdG^%(f=s?m1tDB)O?sH51l$`bNffRU z=ot!!KNWpw-J}$_Ef~eY6{PVXL^7py=>U`e6b|#W^sf$DmU)8o%(se!U_7*YO<5{D z`Rfpc;XsISD6jwtIR9>k3JdRmnlD*&QJHz$wEI}7Uw2v*8}7&=(sWrmt7OZHiB$Bg@*PYrc+Z3bq7++m0Z;W9gLrj z4KkZWMMv_{Jl^yrds*em@1`w~2%ip$%waCjPJXEwSMmgD0QsNXdbgN|*PvnN0rToR zc-kBnI)%MwkHU(_@9-+I{3=KNq2Iuj@$sf)&m39De|#V$9Oc-CC%_GEEzv06J~6d& zkuyPC#O{{*#k`)_#n7hWkrnJ$i;g5#q^>2-ipfK_HApFF8=P{rv5X5ol?X}KLpD|W zH{!};?3kB?UGg36 zEG51&6j9xpt60estl*he?teQ8F!9*0pvjgr#c*-w7uH=Ve~RzB!o^SJUxl1%X1RwwUK{m?f*FT7wtxcH6~J)0ssE)S5!`q0h|Cps*QyfblSMqY0?ARy zuNj`I2>JTb{h}IFyT0rzaXu7YhG!h>l0Aw+pZW~8XQ}0B*qwFu7JM?zMpQ1*DRMOj z^O?e*KLD1m$yDa~m8J%VWs=al%Q(DFss7i2T7wjR%TuJkIAwKTX&}c#)fVjTbhonw z#aR32W9(80ew?~0)=DBIZv21G|3v4=e>K{x;lS;lBbgC&$y*jW%dRegw?Qk z)S%^`R8TDM&0dq>^yF$c@X_rl(vN#Dd!kKt{dOmVc-MpsG&O}cS#)7HAfowqdfO7H zULaLI*Ds2l``+7|znUC|y+8YlirYsR0;^D&6G1|{q^ZUmH_f%`?xYkQA0I#4o8^Z= zQjhMzWn0ZN=b2ikt-h11ASBb)f7aZ7q#H$IXJRq6c*kh7QA&@mPp9jwn#;*O0sVys z|ExOSUUql0y&vYu4!if3rVjJf4L*8gZ(=^V>@5=e;5}!m#ipAs?0q)!>8;?AA0-AM zmN$zla7V>&`eaC&m~s=wI1`k=v8p4+Oa@6x&>Ep8H3jaD&QX-}?#sj*8s^kMHtB&o zhrZ~b0-p0yl+QNL99#IN${^!ip+yB9t$cTe4tSY=5s-1EkpX?NE5BIebS@r6hI1w| zT>XH;x$osvo(juswc{@p6copUX+6YUb3>t(R<%~`TZ^g_^fU9@=&Pm2Ky{?B$FC^m z>o?yrt-my|Nm_!9wewPFwcY}F`4-N}JHkURUqd< z^h03C2;2j({FX!vB0E%r9D@hqxxKykBQJ1S{M7kD$0)>li7{D`mQ5D;@OZlU{ukUlj{$2YaeQCCuXPNB+)av$Q6mb?9 z+We0TFfZDt{cga|DUg3CF(prArTkHxN9y2cdcl_5G;RD~UE}sU@IpWxNvyAvSrEf@ zYOO(7w1|~WZ;0eZyM+oHK9`vNP-Al$hBWwVR-=0NMY&c*U2$Edyo z6Crgbo+K4iAD3VMTmRvw3L}WiS|fg!LantL%RSn42r%79O-IE~4n-AJz}{yYQ|`$D z#Rn4$b{6jw$I2FroR>I%vR;nV4~R5SaHIGTNfwT?u;qWYuKbXgmEIz=s 
zY)Jl+VVO0-e6%FRflcX?&ikE@zz(D0?q<}NRd9OGT}rwU+wQQNT(Aab@p7TqcVLMD z3)e=EB;kn|zW<}v=%hm=xg2?^I-bkez-r9*JNp`(ybgvy*&XXj3^g1~UL&GJ+#Vf7 z!#h;L^>@{Je5LJo-d{vKvLrm@YMMVj?#bn7=+G(RwB|7_seS^K*Q5y>P){v!8TIGXc<-=#RAV zSHMN2OKK3xuDeo_pmTls$i2VcJ zFQ`Ex&4@3gVXk5+9;BF6*a}{6 z=SJeD{pTq$3vE6uypb+_>8`XjSjI40zt+-Dd_i?-^mhX6bu=kt&&b1pvY_s4VaVO@6_j74>jz`whO$&7?i zOM;xA7GqO#FDDG~SuNErcwL{x>*?J^)4$O2YZh6%LSNK#KF#tPl-_?+%1DO9Lw%|znN^IRV<0UZ}Tia4P- za!S5(GqN#Xxjorm4~<-G^SVu^jovkT)LkX&xZ^k+y!6;06pjKLgET`#*M%ThNNJ7| z?k|e44slAK`MD)+?}v*%RF=9NRe|Z{8*h{+arD$t(tkHFOzC(U&6E%}AOxGQ^bV7u zda*iAc^1e-Kk;s9khs}lMlZ=wgT0MmhwqR2+Lic(UA7o@G;}W*+3MO&w;RvhyAJLK z7ewzYn~YuzsOZLI^8=<<^vr_WrM1zoN7L%ATeCgo%6CpD+jC> zOmBAnvg-&s!ZO?g&nO!4!*V*}0WcX{<3BD}gn`5j8Anb#K*p#lY1PjxTAwIX-Rtr5 zPMOWLpf}@KBwrDZOiR-Wz`TXeQDajGlvf!}{Tz8?3axyX`Q<18X;@pa?=QXt^Koad z$1Zfg?bqD{@4l{?$b{zwg66WkuNZB=ZgbbhOXSRArUhbyg%hb89AhYMCNKL`tnbr4 zn!<~Pa31pN`>i`|ME}wn-I7D2xTb#sf!Eg$R(b-@YRFBuPn$EWnr}6gyw2KLf0Td| zaIZ9I|Jj}j7N6|L{TDf|-HP~Y{M1Nk9cx&xPXE8xO!h1Xqjofd%FuDhKM5*CeK5YaC!f0v3k#1GV3mGH$uu2P>YU!$Xk#2P<@ z3s+pSd%v)Dj#sJDI6{b<(iYbWWob9&<#eLDJC>~4 z^P=tap+|q{cH_c1ym9Bb8R;ZT>5<#lw+y*bpJTlm&mptYPIoTL*G-M?7w}@C4)D4e z;MqEfR3zoZ8iNW>E2rYug&ru7#40k)@OILZ-a8}qoNf;GX&Y(|rmJppo?0SwaN7>* z_BSouAuF08^@`)WClr~hcTD%?aNUg2>@~A1cIm!%Cl;Gl)7QtuiC(u?#Jp?OxRTFt z_V_G^9l>z-dfiC+7eHL8#7l5e@Yt@h`XGolKzS%X9CeLj?z|fNBjDkvCU%-DeDR^d zy({#0ioSS93b9v0-mFUb)?vv5wT$f5LLBKuxbwN;^vU?I^jCo3m&2d1&fg^PfUM2e zvGdK|A9uR%T~#*U=RHI>M$roI@+9WD?YHqzdtdR$u(<4{CkBvYcYZ(IIYNBbZ8>#B zN;6uvdQ!ziATbwV!Rp3yYJj-afgaI&i;8RLj(vEy3YS$J&Og_R@py%m^kB95v?j5& z3RRiA;Pe?1cf=_Pbd}j0F9q}(ij1WBZfG-XNG~WMq5OxG< zxr4(DD$lo^u(-m9NkZx+N#ag_+i|6`&W{QJRlG#OA^$(3yYWb-Y`HdKx$NTNIxqMq z3zw4WZ=WJ=k4o=*yi=t6q}9LYyjq2qM^FADrS=}gGq9mLnG(GNKFE=|)BiFB$%js& zXb`U~?38YGz7D+9+oMB|^pC}-S*-ObG16#sxwzPC7!dX(U5Al;XJY(;PhvyZgiI_IfAa`Czm}S7N!FE zF^YQvBry!9?av)8ZwLdtSVbMyx6>iafW&GEfW{@ZKF8!jklQ7vds8fG$ z7>jo?W17G7afG=v?LOY*2&FbHZ7o_=Yu`Os24TrMVH3Vd%m)C0iV#>84BDj z|7zN6M)lr-}c;EmD};ac&^IbKZW(D7LQZoe)Vvu+SDzHV1X- 
zF*UuT%^f`bNj5Pt0kL78F=S?306R&LB}6>W9lU`10@KqCShF=2A9iE$5@5N~tn_bq zBVYyXZt?^kKEb~1e`B`_`u0l+#`)JMd8oJ}W7fB`eSNF%mM?)R;`O|_1H{NWJ&+hq z(bVdgxWc5D3T}$pP;saoJOPvv?G%UlhVF%e*cgr19kL{8YYu_L(8`7pwa4`Y5n}z$ z`9nYK0((b;X0wUyZolP`KEVRx0ujOx=O}S%dEQ5e3j?CX_rFQE9y5))c+o zMVE)}sUFAYe0%YzT}ucc0|M4jLTT-aV3Mzx_kB&+bYYykH_b)mGy>hSj`F1qK zzcuvLJM8^M90@*O>$*f54tBuV8I=qkw4BvRL2S=zxcSy(yuSXFgN=WlB=kf0V@vGT zrNzlIB@c%j4&$X}=utwB{6lBGtrSyO#m-7*7fx)&SsCcoK^@Q;2YYq!HF^aAYr*4c zQnh~7+AbfpJPlYJen=2Wmrg81^M~p`A@l3nv|oFeAyU{wd))d~>RPqunL`-O&U+1& zR&%0D4-IHF*q|80j|t8)+|5Q%$DM?>gEy+pw{8x$lkoipM0usreD3zrubsul!~z4! z$N!o^c=9l#Wk1L*rDAcL>-fF1b%}&_I?7iS4u^B)c;A=~_yfuu;a2 zEbB38hRSt3Q@u<`*k9VeJ{ogOte(3nIN!w=U@3eU`0?CfXSm5e2XHgh%~4ga03r5s zmZbiOKScp|ku<{#@6Zm&Vw8?lxwp?}R4^B1GM%(!OBo?DKbo7is86nQfS{9=mWH2` zAC?@!hO_+s+c%k>;vzvO)f6@VeN;q)K(A%JegA-R+cs(Fd%uuRr;VFeJX_}>CFd_i zMP$Y#rMr`V76~>zxD0?H-W8Dp2i~gNZIK5QoXccJ9BOf{TR*AaW;|hs`A+@G2jzHc zC7t*9ThX#C6u5srOU^u`C!xs ziHX#ro9oP>W~K#jX9@r}j*piVwr9(JoDnepCK!JoDEG{p(wr1VDRqCJQx0D0TVQRr zY%Y*EY*wUoIh5ndVQhANrSksc_kLuEi!P7@xT=6t1>Kz-#aGNN zM!rL^E9e6P&b`0gT0I4{SB#;$r(Vk(QHk(NBG^rf7K#4=4*K=}UL7s-=>u53-}qTg zgXOCF*uozuvzt?_+Ez*1!%KQ-3r-1Xc6?C(v*srx&4#Hx>U$fV1LD#B zJvSXHJIM!&=jYA-JhWi#MO97zl5W7G1HwjEOY}Oz7{rkq@Ii--V}ST;%hb8FrPoZ9 z-@HC^BX`QOndT?$k8MK$WHN1OpHwh)pSeG%PR_1Al(9kxH+Wyi50IMgc^}p)K@*>w zC0bCy>%X-GE$IvG$G=K}xEB4SlblxhmE3^vDzDzQzj`&OdYyvb6`FYVap1jj}b6u>PcD{ZfY=2$MWuV%bc!xq^n|%T;%f5$?`4$3Ex!L>w_o= zRL%wlCm_~rY%?9kp=2VLi?*51$#0nZOv%2r;twx#FF{g3B|&Pfr>7@Ld3lVYZnxw& zDf@_pxxEJO7#sPS-IJGQKIUgfh*%xCW^A-BJhgO~!haJbkyX|clj!yxcVe&iXII9$ zPwF*$>Wy9%jSkd_BYBcHEHIP|;4+El?zC3N!q18qd%-nw$F_DpnXN=FG9UtHun zKH#Ujimu)i=FME3vM=o!%C%EXexLxV6$8A82J_Xp#DO-{ktd@Og_lZ-eXn?NxgnlB zj4pj*Xy|T~bPTyCHC(Bd)Aeo(n?IV>N=X4RMB$OF2TZP(zrW7NsolmzJ7bRPbnRv0 z$XlTtAS4j82PFI~2?&jQ{>Nl6unr8Wa6i0fUD~^KyBUlS7L~SlsAqQ9^*XewnRlOi zVMf{{P(V38b2J8fSBY+;+`j3D;w|nPT~d8`dl@3P)J^=_Si)Ub#4v5VHO_w_4Mx|< z;%z%M1va~r^Ia1U}%OYJZ&L2Oj*9;S_I>7DN1PjjRL+G+2TO`>2CAL8tXfo 
z-y4_uePIfdo{f@=Q+Eq-iEB{V?fLA;>Mz7ozW&))gZdy*w3%akdZ7H`x_8-Au{AhC zajy~rw%z;CF@jInHFNepQ2^&oT2z$!HYzz#?i;SM|7=nhv_1R9wb-`oqEO<_DkgXD_@^=45i0Mz*WM!; z(6_+EDT(2e20ycr(kN)1Z8V@)AM3))7QOSxT8%rYl<&Hx`f&rXKa9ieT`>w%MK@Q^ zRXqkwz_OPNp9AJ3K~h(ISJJI^wvRLC6q6!i?{5s5t=?bW#Vka3j&7h<&alssr<@zB zSYBOJFZ0-)$S z;j@NS>Gy@*h@ScZ(Y=MGhlIt7967T0X(blKnmV7*n_ef0Hkn-#$Czw8h#ok5H5LK& zzs<9w{YoMmbeFy~{xDgY6EHUVtn*R#)YAU!?Lx7YV0MTf@0;A_fv3>|DPKVx zG+BURv%W*zNJ=pQsn;O%@nyGniJUOx^I@7=%3XI%J_PmrLqZBovW|vauCk(x1Pw3y z8G$MhFClN_Xhfh@$TZrDO#W_4p+IAYZv8|P@X?kt#VaK8ew*vogU~y zLz$D!cG@l*E+0OM0?7C1S*5;+f1XQrcA6$DeN59u59$>}cd_a;a_oQ{cM_uEQ__0H zWX0fwG%RhJAXuhqIQCakyTHrkra#eQDD9IjX4Bghv`?~6kt!`y2GUG?YewyE?OhSG zB{dKyvpM?vmz4bGwdD{wcT=HD&2J*6_(yqk`2{szH=f5eOEz~00gX4wqny6jE&J=K zE6$d4P{$E(Zf5z>c^qW%vJy4{IvqoRb+JevMH6yxUHcXQ6eMm z{wJYrm^(FQ{U`C2@l^Pz>1t1}fu-AO6b1cIvOV>@IZ9S)Qr{YBufYW2Lc;s~r z_h^CQyt*rYnCQ=%}hh6*0EzJ89ZnQv|+}9Py&|tYMAQyiAU4jCtFOD%B;i2PtJvU=7 zh$yAYIGO%pPLYN9UMxACm3Gfd4=Wu}!E_xg*^sL7sIZ@NZm^_n=nbceHO`-{v6!l= zPyzX}H>kv;h*UQ0jj~Z{=sE&54yrfmwR4*a~*tW>c6sVJS!}m+MwaXY4Ylq^a0gW~(Iai>Yq>ane|zqIcqZrms;)L|WBzU}6I6D};MJ zi&en#a-ucKJLAhVAWpOWT65a_Hj{0YO^c@B^Er|=VGl3oLm`R^54UJo`f}e<70JE1 z5I0r`i3cfeW7t+ypxMjgBsBE=;$nWclW~qKlV{}3Gj#yeuy*UkgdyN)25&llk?iz>eN1VK3<;G~{qx+}L{T>KfL_Yv^Qy;p=c~y+t9XotKwwNgUDtt__2MQWg zi<8Hl!xp|dS3oxKV$mApE7$RJOZGnt7*V1oKlV15W-z^S)}P%~)O#W}QT?PEmrsOi zS3I?05^Juz22ZHQOgjCbZrYjO?!LzBF2-g$5^#?=)YCTGf4wsONgABs%bFp9JCi+J zs4ive;uR)ZNLAyo+F0<{fO=S&piaVYiX|w*uDhucVK!GVVodm}^QH`gV`fi>YT?SJ$ z(#iQ@!G*Uq(6R2z6Q)rA(AjgxcS7y-w-%pjHO*DIcHt}uW;I~?2Vm7Y&*%Q(!%`;t z+ET+<)R8lMU5f)!O2}uJQ0EnPwq8QsYWtG67k``?Aj;vGQI52N66iOrezZJf2bA*D zj812-m3$RC0(9}Q!fIBcUKl>|E~Lt@u#U0HU?J{?T7tI>+nMZ5_d}>xEUt3{k||gA zp+gwa+O?BSA2TR6K}|!eUe^*`h!%XC23J6)HaPFUWfeni3jFv-nht|0*P}Ot4OkIE z{$6vDjk(XVN8E`g%NF0C59y_>h>N?DS8v)(m+$7J&V-J%*;L+IhyFPT$e9sixeOkD z<)X2X9%FJWG^p2k?;K8wYrA&d%jwZ!96(G}_CBZxIa*+Vf;XG?Dr&8+kvk^{c1_wNpOP(WVJdm8l{~OP2uH$4ASU<>r9w&A^t{ zFVrHGt#D!`^mG*ySZo950|vryx(cj7!KJ@<)Kc7`9_V3(!JEqZ1(3CC85e(Zgj_2A 
z;(JKb6mJA^{#0-ke6^LOCE=l_ot=kv%X-WL$W0b~TYVaWx(()2fW7vmj9k#`X(*FIQ>`ZUxxi%2J^y$?mv}2t%nz51&{*EwJSus1Bz3<)b;RAp#oIvUi zeib>4L!7YrBf2*5pNwbO0D_?znp8+yWA+?TEEo*J;ZflD=dXko&j3KtvD{EAbO{2A z2(cZs59;^GGEH&s?6jOvI9JNe;kgv(lNAG-KO1c&H19L;_GU(Yqm*fD`mM(OCKrhUmag< zzCiE2$-iu7K9zrP+B_(O&UlwH3}CoR&wr~#A5W}hRX@^a>1vKCx?Q>+pq%|bOucnf zl->J1jDd&7~bIuie?~6d%zR}L_O+(eg(0A3(Q)VegS{QMT-62-h&nN{LYFZd2 zth!pv+65BTB!eeM57Ndq&JBIm~t?<&~XDu=^L!t3Mt`0G#6zhF#U%1?P70a3*s(-qwBK#G3uw(e&!-1 zi{zJ;6}1zAXJBZQah~mQQ$`dV?x`0aDXt!5Io)#md7V3$XJjEOItWE0%Jkbr-Di|e z*BJL2&9{tm<1hRu-VT}~AELib%3v$ZNX{P&@i2gspM1$G`(2RoSEyUy+#n5Q8rdTa zOQu_iAEt~&EV#8`5sKmcF(g8#ki$)BcJA-fh+&po_VGJNKgt$D0H@@y@;q_Xm(cjIZy$x zH^L5i!A%eSi!Z`1efbWs=xm*}%mEUFaoQoKohe(A7h*f>9}H`qb@0yEUD5enS4gJf zaVSAf#?^J37pgyO=12Ps{@8}&o|9e|Wf<>U!dTSvaizhxDtmo%H9Ma#Ry?IHJ*F0c zXcV_Z26;NYblp+FB@1MzYiM@tVvj* zh7vf_ZoiKp>2@lhvVI?~a6XwG0?Wcb*ZNTNWCgvhFPr_|^=ML5!~GI=AOw~nsY#&Ha0Y0Nw4*PHm zr!WN>Jc#n&Kglqb*k)Vi>|*|!?C6%^^nJ0+DkVx2nb~^vlF<2m2bnsj3beAw zDrgwmvi1}I=DMu|$h}^SfA8~*Zk#DwFb5e_G0f4Y6lBp(@(5^|ja#Pdesr2|TD{3{ zU4R=mw@DveWlx(M!XZqxsYLZygEbPm350= zfLm}TJgE+F@7^&;a^33!@kpMV?_ZRI_+)nSa;8S|vq#4a9c~ z?q&HHDGzS%oAc?r@-Bf-cQL%L-)MHNF>LOEZkcJ`HW2ep{-#98Y+Z0U-`K|E<9~IQ z?`ow=BQ##f!KIdg}JW0eQZzRtlv9i~DLd3@eH# znir~)R!{hhrE8+Rg4Bda|JDEOi3J;%X-Z!l2y)BM@vvFX^x*G9n)Xn11Uz0YVI{H4 z+JSXUD%m;`zWN6&g-w^|m1Lj{J#`SsHFM?|z0n+xe}A1D`I!&xebC+Y2J{S0GbH{A zlrj9F`%UZNLvphCQz2%vrmw>-+DMt})?_ahP#uU{%G0suEdMJnE?g4x+c>vCO^`G4 zamJhkx?AOUP~tMEFo7jk0`o(SUDifrMdDCpF!HX75GkSBD>GW`BiEQwGs7OavE$gR zEcd#fREcopPZ$DXFVj|HyvqRdJTe<4HxP7B{+41xX&(cu^L#BG*#9gyBKyyUK*2kt z=C6h13d0@ zMMpcTcUGBvbxV1?w&gcHz>&D49=a=VE|-k=J6U}W)$Whh94rPDhakJE4m(7|CZ%t99-n4y_8TP9Fl(l zO(j8kBJP}=y~u6J|6oa@>by7qYbhZ0s0)*zUEoAdE9`{&WO1usPk8+#iFI1*AyC4* z3!KoLD))Sb)c6+E3*ULMNT3DGXl3#mBb?Dqn1~)L%}L0XQez{%*d%hsrjXvOqth}yvmRH2a;!7b4$C6bLejjn~{d`Ncy}QPhcVqg`87 z!gqS9bDa2#_{5Zimk$=-f`&o%{R0n}|1Bw=frJX9-VUMfHbs5&@u=&BP`CKm&h3R6d6 zLc7_$nCs+h$+}*wUbm>V&5q`qhu@s_?(^=?YnW&u-q*Ra`PItRJj~RL*o4xgc)nvy 
zEZ}{52s@~OiK=k7z~!aqQb1#cZNy>46?&1#bVFSG%>&`t@(qTcxO(YY)0y6k{1%J3 zh)})IBHUf_U*pPKlCl00qGV8 zCS=->lJs+7q{>tGF`?|%07)Ngrx2JuTsfq?f&BNOgsH5zw(<2y3NcXoUho*p(m%UV z?c)JXzYI*)1$hlMH&S<*)X?D(Qj{tRok@wVnSBJP6?G5CyHX0qsmvTznJhF(Joe>P z{#3^E+WjbA^J|waetd*>~vgRCqunIc}43z2@ccQp!F%k18$ zG6aHVv!Ka9uQ2e2CSPz_^BH7xa5{g{p->4y8P{eUC{$D9^G=v)pa~7$3j2L*JIoEc zImWc7sZDX7Lq+$Yn~mS z$Ndi|<3(NMBxU@npVt)BJVe?KRlZaiHgiEWPaaX=Mah9kB!hW7BL#Vv-5SD!P{X&Y z3Jvp(uhN9*0_OMKYK6Iga6Sb6b^hpJ+HuLaVq_B7k5`V(Gqn$wDC2Kd!alZ(P%g2> zACL|2Sl4A)9awa|kd?)M%OzYB`0%72=Y>#@uKXeJc_@rq_$tFNoK}%oNJ-$L?qL2A zd5x$wpVBKoC=Ac~P|#vgKyctdth*!`+BCu;wvZs%^e=D&fFJ%Xmcf9xbH{*Z*&g6& z?bBbK;;;k2RsuK(Wl}IoDh@sgfZOtKP%nz0j4djeRFM5+srOFJ$I6(SYkk4L5B&bl zyg+LnMl-^vu_{QQKy#<-&7kS*2*sHfJCfbnG~=GN$G@azO9;pY{X7u@ggYfd}j2(!}hNdFNr_b510?3@^BuP|{98=TG5ZtPO|dmE`T19Q}nU zBEzYH14wz0O>`TPE_zj28zITGupW|K_FLpGtlLb%Qg$F>;EqgL*JZ~TxmLA=-BU?J zkbv`X1d!%L*lr^vrETXVfjnzfgYmIbv$0r$ZF0-p4u_JTM6nTvrUcG-QltwO!Y)DuJR_gJ2#v$JO3lQOi>_YwK&o z?MG?4CK-T*+;t!VXbr|+6C&_oW?WsgiqE&lpP%pXG3l+W2EA~|RZO5YU~^AaT3ieT zAh9~eAG<=KZl#70HnXZ9*L>vNUk8`rD zG$7kiE@5a*cXx$uzIsp*HDBj7sk#P0KUFDJ;VR5IEM`4j(;pxI@Km5<`0872YMq1S zm69#D4y_FEMuQ5^pv6U^$9CPSJ-Do_tb^@KC8T;^)?fTk{QuZyKrGZnB0w~|`NXqz z4gYN~tCj=s6C2Q*e)ZzF+yc`AygFV|VBX7NPWXtFTZl$kR$$_(4isSj=pl-x?TYV& z4ho)&2`?B_m0a$K=35>by!E<$AIT_Mj7;aO+jpFUM!8ATy&e24rHR0#1ySBPP~v*7 z^|A1^#_&@g%9Hx?E_+j1`mbAC?x}4T>+ibHBTP-hq(-YH=DKfj@L_<^9B=qpmu+{D zu1Mu+kpQp57;0~1lu#RNKU+R-Ry^`hO(t@+X=CY+!)TF~{nGwuokK#9T>Ahq>3@cM zpfKhQ{VV>_WnOCMUfo|Pz4SX{XXmL%AZiF@Q_};;CrLY&0%4=tj9plA`5{9Knn22! 
zByT>!1UU~^y&E7r)_E#@aZ)et+|a|A!77+vSiT5%MZy$gNa?0|^FRc;Qq}B?St~TE z?Zf21sZ0#IvVyuIs z4GR`?JHGKAYW9~|p)SOrU7)<>E3q+KQp+6ngdAWN{~G{_j&~@?XG&~gc>5ASW}?pf z`TxK*|$=SJ_x7@EzIp7Ju`p`M?nR`q5TE14fb^AFb17%?wIv7p3PO9L3lFhW9 z3ZI!_%8PX45l0#(PNe)*yY)L%Ui-VQNU=|fZQ+>v?l>RzG5`qWIC*B)b#mFMQ{;$N zXrUa^K53GBi?!7NdLv}&zYl?T(}fU7J)(alCCyf7*K@klN5;yTZ!s>_7!M6EBSR7} zY!zvZ-ZFbT_~F5G9T*Dr*l^43?3{KZDp7H%bt$m`LTYY4PPQ`3aostk zX(pSzwB?zBV03lbL_hg60Er%v&8}P6=H5{qs-kbwH&c7kY;6DM-Nn(@CmHqe$ii+s zDjJxX)~1(AUe+&LG-R73#*`xr3LJf5MI;G~<}vxE92~@2QeG+l-~Kp=Im?%9ngmz_ zAIDY;I4X13L!06{Ryb4LoLw%t8{fqJGNa9MUkS~e!hcXq&#w~<022=eVjuJb6zH|( zp{*YPw;8Q8!}v>2f2DuDhgSxE&6}fF2|Gz(&gmN8i2CZH{}wo_ zTrl01vO`J_sLRg0=n=94Y1GJJL6H~F6la#fGxnQ!d`zgk26<(#+gTJV6JPLQ&stJ8 z-0obRT8>r8D++3VFi3(vL*26=u?y0RK4gh_T%9xun9-*|pPUzO+KH%nEeqSe{Jr=s zpcrl5qe?z^R^u>c`9=@IEY0pE1^Nf2Ci=ET3k&2D)Ye=CdY{RFdjLqQL~|bAl_}{i!P6!{4sSNOCx-puj80!mAo9!M`nfv&6=WYaewWWIaRpCAki7D~ zH}Uuq9-)vfaHi~^?O7oo?m>vTUmU)sEx(Ir^r2<+>%8C2%0HK9srz-e~%Z)n_?nJI(yZ$_EVKU&y-^`2Li9DYvsLcS;;6! 
zW0Up?VqOR<5(Ef$SrbpT@rlHS<;SwJ9Mi=s6*!`mDf+cX*aBB+ax4cbLkNM2UBr87+=EVS(2pJskRYo9!&!93%uV7@+Z-{Z_n<>DsDhTHTP@}*L;Y%fAW8D zUrRpLo&vVvF~tVXm`zdz^xw)Ak}ak~|N8G_J#$YkR{C{^_Q+)8SG84%U9or_>E5*L zwP^Z}aG;%)DoA|%$2|kLrre`O*~MCvymSgum2{>CirR2y@5yT>{C2@E{02B{5Cpus zdtWXM30#KOzpfs=bG4F|_Q(c!fpQe;r7`5ANxZ}$?S?HOCTKRkMw5sh7 z(lr06_38EhU7a$4g7h|92GtpLPX)L(M@p8&&)1G5RR3_j1{?IRMy5f@G(P9;6p$uS<|GmMcoN}R1 z=+sbTTX#348@67985zgWG~p?0IBuQBkH@W6rc#iQ8#NI(9473ezB;toUSo9e(k0f z$#nh4RN?-qY;08Ff9^8g-RR5d@foPeS)=PlfN#4?a1il9P5l3=DbWwBJD(u+g^!F=Ixmk)ztVf^2$0Ey>l4M6^&I8TeD==^Kw4}4CWmh;1$ zck4fFhSq;rkN~IL*G7P@5;RrypnmQ~Owwm3g3kCNZ2pe5SfPJzShTLmKjEnWN!lxW zC7`A_bCXh1)>4Y7Vh`@0)9mHu$9KwNpBAS4pF_K)xTA}~)xq{WITb}^^krvWpn%66 zxQ&NBF!0v{#4eQ9h}Iuk#3noSN;X&*ws_xD`j~zf|71HC^0`BAlSCxIh4dHU@J1o+ z5iN%z0BF~{PA8{%E*X-@QwS~YKWTnw6J|gzxCA*S|L^sT6Q0y~5AXZ2t8~Fd_P>l_ zCxcyrUBar~;U4Gnhc7q28G%Oq)BqR|h42HPH?p-~Gt9bt-VmC^7`Qr^UC1y0a*B7C zSn862%ihWP$wDbt`Sgj&sX9a~^!`$&k5jiEM@y;!%Oa2gKOf57b>x z8-Cul-V6$w-I#snDFnlJuPd4wlJ9C>A{Q*EeJoI5AX)Z}fnUN^P`rfe?gll$#A+3h z(|%j3Xr^se$Y28%Y!ku^ibyf{iZtAp7DCV?MG{BWqTnLX! zOMMQZ^lb`k5{gU*jfPBt8EPlM)*eB&h>V1~rZ7uobKg5MAonb@{UQO@1Ohy&fpvNx zkNs(7Z#)LufUA7eF|z+oAm#2)k|_eg58diXsi*L+@DvkIyS;!&{jQ+hB+Z~)tm(4v z)N$9Qxkb+@J1L$)%`lIyyX};M_p=-W`HAo#Te6`wits6>2Prcthg>UP{DLE6s6-jq z!f&mJ{t2?6#n0@-e6yejzIBAB^h*j^#Q?Kp$>QUm5jWMO9tCP^rfAkqjDy2CcvbF{ zv<>eR(&^1Nt;mN#|Ff{q8D1i(a$}@cvI8|P@f0;wTS8ux&tZ5H8m(DAZ?cPI$yaXs zl}&y}-yzM=HG!S?1i|o!)x8>5+6OYIfnRYys#X}}seDlVi0V!tODG3a&s1NX0(wz{ z2izNbhPn^`PlrYw5O-A}gvlc749rN>!z%uY7p?@bgCF9xtW1Q+l`2Y{SNnPYZIpeE zN=SzO9k?UNWzxjvHA5CGJ&;6@K?=OgUvJwp*NN`P!g}ujEtp z=cfT^!M}mXgjNp)BkkAO2+rg8qctN)fk@*o1&qbadW)RsKq*ey1QP|tqq@y&ZkC0+ z7gQB`6uzE3^>+Y}e|@fF!jRHG5G0My9o>`k^nV_*_2(3mdg|ZmQV}ybKv@CrFA-2! 
z^^^#fCQ4(mYPly#^jY@sCyMjD`uEqF4pJ*$$=^dkyrg(44`6v_=?d2l_~ZZQfZ(Ov zhTrs!0oC7Uk5ayM1|#nR(d!f7Tf_aX7p(U(2{YIX&$V2wQwhNL@KQ)jIK8#=Z&{ke zZn8^-CgT4iZUbZZczCbS3Qs28+V%(a-*6UK7hZ$4poQAQgHV3OK~4bO?dU^^m-5U+ zjeVKsn7rm+EBgQb5cm>E`GpZNnMX4OFh_&@CP3p$l*Iox?SE4T4L^2J0oYpEMJ-%e z3jZ`h{(nO-h~?FXT5(P>!FPPj14V=Qc-F73bO)Qtr=ESA5e-jhgxmWY8@O2RJzsZj zOMC>&W^k&u=n9hQQ-Sr9-*`Kf=cYUyJ5nSo2yQ( zfH?|pUBG#SlQy?Sq+{y%O9JQuTzZ{PvW`_S;CF96qW+sBb9G0(If^KMbWDQ8graYM zLZxffn`C{)vaCZ)`X{537Wc!_$mtLMn4s}g6(eUE?}tN4nBX@yOKX`+Y%$k%{=V`j zlR?dVO`VgCmNCNn7m1T7p)aPzIKCMRp%2TqE$-ELXiqj~Uu)L0#o!^pnMKM0uNzjw zgMX=U2=KIc)Jrr5Uw6}kuhAqHBr&Yl1qkRKt!ijTCsZ?8X9rr}E2H<0xiS_PR;t{vPfG+XT;WFXMP>NRNS$G+CNa)Qfs4vtiVhcFX(LfNNixz0twbM0f>x3Np$ zJW&zLe8jhC^)UDKloL&z*y@}O#@OuaSR#>P0>6@{qBviC+LSM?WOd`d*t+|g=2frk za$`n*UwdyN4e3ntD%_7xOD6Nr7%oXpZBo8j){Bhi6~9;B{PTJ$kJo7K2x1r^TK-cvoEUi~d6Ve{l>WC54+UH7_tx5iKxij4l(vk$ea zjr!Wb${V!}%e$53eK3XFcv?cn6m`uq_-;tgPM0=oL1A*eNFevpUEqz?&ZKB!k5$ns z^Qn+4W(yM7t$!SSgBq{_r^eyVLjwNvc1@TdC~OpVNl&5eykq9hP$?yY?GSE%r8aet z{feEaLVrcMQFTUi)csw`!_tY(8iaAqUryY&h`XyY0yp~C^7Q3uv| zZyx`g74)x@{@}<$VBt7P#M1KHY9j>Hv-kvyY)3Vl_%59f-<#8A4>kAxljQij2bF#= z?O`I8U2t^%g8Hb?t!k=`#iU5$USL=*HezSyasMid!<)2Y<@)+R^;ibByT5-jgFgqd z52#SnmJYguV$ZzRLXzyRgepM=4A%GgRH8-LH=DN2zxi@y)u!8lSU(u+tvLf=*N^j~ z##>hXRcx-QU*4Xl?Acr-jnloS&XKs+&~V!t;zn^%&9wi%to4>AsTNHSv*N_fwu(~< zr5-PQg(0f*EkX5$-lHyfhbtJvz-)R}i1 zwHSb=0-p{&9&t>+5>*jXujK`az7-b*Ei61vuuz2-2tCw*y|cqGwUkTBXzH8%Oft*H zPzWPxTRwL0f4*f<%3;_4-7WuCO4?65UR}Cp+4P^O`1#p&Z3WJ6Sxh;*#=?AWQL|h+ z!8rzY8`N90*2A7XxDTtQ zP46Jn>EjMESdWXe#-BJG?XVy`4hI_AAxuo>&S~&u{tidk{QPGDz!=6(Px9m~-RLj! 
z^k$G*>||rLUNwaC*{XW9&n?Cf8OyXk|8qzWX^jSgf`2ZMJ89ivQp%pgWAHlofIzT^+#f%0Y?T$nZ8mzd!CYW%KaTx zp;+2jie}8HmCBDL?>0g;s~Jh&;F7}6YXul2wb*NZm$znHM^EPh42#*_JOUin-@|uc zEzq-VRbfKbr*+uUs=^0M3+B61l;BnsFVdHu_;&PoVgG$IbWo6wN-IuD= zGLYG=V|RQ#9nK`ywk9s1B;wiDFDRMT&Uljqhc z_I*AcVW(W`7>n06Tf%jb#3PZUl7SPRL@f>EIdm!RBbw_}TXP}x9)0-oyY@VPdC~)>; zuYvm-W}jf#Gw9>E*>ugi&a9eP9^jbRIJLW4z&$wsSubOKA-k**Ci(d6H}^4>6Qk3w zo0V@W;ePXUeb@U{YEA5z&9=hr#gpG;FxcYOzEgUVm`L@hTdntZljjpAXoE?Q#>w;x zKkftr&E;NS_qtiyaTgc@qj`iYT>!Kv@lBz}Lkdwj<`BcIRjx?H0?Vb#d?5pSZ_@#ru4e>qWSLh{1|MQ{8 zhL?=K$eFCZ;%`{0t1&Pii+|zRP^8;cv+X%Na_0;7s+Oo(%AQ-5p`xr0Ty@ZWwSS#^ z#1Mu%jMk@NOgy(eeyTiPuSm0EbcQpBd@j2>kj6v?WHa*yRx3`zuZ}unYi#}yH~F>` zG5Rqm0M+Y-l>!WXN zmsL6rR#9XoVeP@fxKs24zuJ!lF}t(4nQtOszZa?tDe?8Ge)cL?{OcuYp-`~rY~t2+ zb@kcl`q&nzXl^2!U_edBen&}hdun$8=Z7qFAziLc$BF%N-;9d41oTMm4IyHE=);8@ z#A>D%o}|Vb3{1B4rd{pCS9}okwh1)a{jFeGAyHc6w>cN@Ih|~PTogRv4mBKBL@Eu< zpjOR=ds;WI^+)<6KbIS?8jRbCU~^o>^muii2${yC@vDMVNV`)Ljuj|?cEtf{X|p=k zOT4VKofjvko;*5YaV}U4okiVVgZa=^jO3$*)S;)B=V_!*7q^n8-|MV|4)wd<`f=06 z$Pjy$8g{~P(Mw-#VuCH7@c4*c>2;1N8}WnCBOyvNU$!JW`)M`Yr4f>0@Kw9au-zbS z?7>j@K3u?RlBwS@ejK9ex6_vDPoL*v!L^Hf>h5=r#4mSe^m6V?x${LZ>Ew6-Wr3k& zGxo3R5#coNw)76ym=}-q%Y-X)=+%)pm^4r6Rwyl~O3)PvYrIiJo|YeWCY>~GW2;L$u4H$QI&^cd zRQhcgXJG;EG@`&LRGyaBz;wiuv4~5ItBW(mdR0- z`iqW#KXjnkh=`~P$}12tPfuCx4&-e@NlrgF%Y9awe$6{IIkh{aEJ(*>!WrphTh#JN zWu_j#k++LRZH+maIu}p4D3UaVn98aSk*Ssj2e_@fbl-74GHU(}P0o zm9SnXQ&+izoL3l(f{F0pVp><>v>Uo2!~&mL=_@z4pZJVX0u8;CfO$a=tz^#VbY?C{ zXkeFylsMHvfaMc==t8B`kY>VT(SW+C)NoD?`%Tb`m8oz&%5}=({g$=rnsa#F`Qunu zA#+slO1^;nx?N0Si-7BN+R?P1qKdwAO3^Dsd@{O}%_pG4zFBm2E`{GP#d~W0fJW2= zzIBQkVxXsO)}%U-bNuCl3+M~)0|o=IbW>pyWx+@$7fiEF% z6>+lW^$mdhPBO*gFKWe&tB!eBZ(5*Rr;3Xe0ncIZt*Gu1$+}jTr7}q0yvKY`#r-k! 
z70oFy)^tvKUkf$0<<5iF!-;!H>g(Z#1ji?3ufuL_77-9zjxr zn#yf@2*fS{n<4GLzkpud+o-x3QXGO9Ia_Z#XEJ@dl7cvIftMtyDbM!5W9g5*rTuxY z4`a0(tA{YUdoBanc2<^E_={JZbLn%KcCxcJJ-+2#fSPU)Cpaj4^94OgFV=KjYeVE2 zWAFoV5Q(L7oM}D4g=jCnjftKWu=*oaUx`VFfR7egE+fHfZJEb^wNKH1x?8dNNuBS@ z+e*ghFOL!kSWsIG$(TQ4-V2$!`I7Tj&7qZ~H$8j4c9XjPDf;Fu;}ZQ=DVf~WQgn|G zqj@|uObE;D2EOitB|MKq#vP>Eg7fA=qE&0oVvu^ukc!gRoTA@q4FFf4l&071Vx8# zUriY#rn#wKI1b+-V%>B@QXRB${@E(awp;fj2ZKyZeF6TM^lfmB37Y&10vX`S}m^G?gj_G~DtSAfUpMV-P<5wGU{$Y!*?kHc&(_}F(RH`7y3 zfOTUrnalm&={n1N(t@`KN|!*|-~7^obN>coH`mg-EOz$xvKv)x!Jql5&ox|_=ez&M z{Jo5}8&+^YoV4BzyKcFjA2-bEI9`~xD2 zlk(Yskc=B^*nxAOW}FbesBIzk6Ft7?9yE{ExMU{P!!P;ubCy>7H*zSeEyQto#~?nc zxIVYj@{`wvsJZKW>K<8~3Pe#Sd@gmu6S!W-=z#0>XDq7r^J(TI>4mJZ)HsH+YZR9@ z(21r(sv8iP&A&{}DR}SyExB*8{Hb2^>8^ad7L!=pBJOv=IAX0?iKTi!X$Z3y?LFW~YAMZAIt5|J(!|on3ur z6&Y-lv_znoEEI+4afnr_NOADtRzRAseJaX46Xk_l9mzG}Vj~jn!g8+%^IEq#4J9&@ zhVHBG`zN7sDIxrc9uSUz^9_LH&qb1Y>UgUxwRt(ex7hQ>?<|d&xHX2u*{>q&#&}=* z`>)f!w->Iv28Zq7a?d<$0Qr?GIuAd-%;;4pd{>SIk+hUhf2}mNnAX;c_y|hV{I)jz zBX7_tXA3Uk1{d$9qH5w(0dEg8Bq>4YBa|@5+X8Hb&$^o>5jY@1Ed3y|*zjIC6`$$&w1mOr8h& zMt^Wz9E+YVD*6&Vt~0H zjl}r7+ff=Z-0N1%BEIruN*?P_?uhE|f?LLn{9Ui5*hQpA9+Rausr*7`A7Tp3l(+7V z$t?(U<43N3&?1o^bOkFDp3EovR-5%$kQ%Odm^HtA{nF1?sol2PedR8%2n>`zl>CuC z9q77YHqMzDN8yf1+X3`{q5yEG+eGOIxT(LTx}2? 
z+F^r2eH#AL2JK-sMMb$|Rxu2TS;`3GN6gpPb@v9aU4;8xR`1%B$ST@i3zhr5egED> zXpV;2xVZ4d@j6Tud$Qa`c3$@G`lUFrI5=Gv@=>Y{vCueZ1)xDEcz*U`LJE@XRP*UlVfNUB(#nxaKU zo-z8B^2*$=dV9amlXix!^q>2DuOmySTGIugtXFosK58FQ$Z)6*f<2==WrL$npGY_9 zjof?UVxsh~OMAK*RUIL_XMH^kxEMpekF#~#0$VXQ7cm5FUXW3aB}GCJ`0+tx6Dx+Q zYjM1psBEFm7%5AJJ4FQ9jr~mWmLzeP+FBY^dXO2NQliXbLJ6yJ`M@e^*Wi*w4PsEK z^)4BWP7q)MZzQ%ziJP{JlsRX_85@x63Uj{jcwfn1Pbk*pdU%JA$X%e`?`ZA&YZmfb zar<7}(kEviTyX3@jmf~}->({f_WBKp_Z+D+5BkI2EOhu%ema+A#F&X+3=t(#apJJW z=Z?${yaK=$mzDLqT1yU>J{Fn&JQ5P4lEWmWfMM#$u`DR>t==swbh&0>dhMqwtFHny z*{~29Bt>t~;M9<5O1OlV)xlc+zhmyTF_*QRtX3cqejS4HMH{+7FI>?XA_8u~lDotZ2pK z<+~7MSIsJ&)+Cw5TakNeF}4TJxf)Xx)r{`jZ^QJ?xZX@&4QJ1);uB3DA~h|`NYXKj z;xTG#;M%Qbf2LJ6cu$e|QX+tpggi0lHrKo9seLh*!iz;mwd+@B8crRc<9UZ>um4t! z+x+1vJunIw$nA^SSGyWGznWUeX#9Be%;r_yz_7H`#fB4afq%K=<}TKUsSeHHuYY!~ zc#}bk@b#hht%x3@i@dZL^e)N!`Vi1$H&?k=~NSI{_KE2FOO zotoHM3H$mY>pe4@eRi);KP3u(kek5nb(!+!z40n|>lkRzw^(DYV80gLIjZyiDr}1( z^JRmg*WgOfyz?aCnb*|>tk`=;Wy>9gXa~!d*tErCy5sM zLKgUa=63W#E-0>y0kPW%PqXU0G{LSG?h|%?)~OOT4Q}h@kg+^P56S zkvGnnqon&{FE-a%PuiY3Hu$HkGQ!WjCKJ>gym$kOtCn68!aH8!VlOgV=sj!e9=rdl>FgsDCAQ#oI zvwqws`faA`^P=HV(_b)^Uu26SRs zI}=$|k~K-gv}OT|n_=2sb=PTBpPplXfZ&Yo((EfvfmPF7bU5Gl$S5yXNzeKtb@kY@ z8fH%=pHVi()Rs7Q8wHn(_%}|hx5Elr5(dTgWc~K`-!x_aQS?mgySP= zH2fTW?aV7Fb>w0Z*6cB*YWPFAOmMj|5cZfnSzg+nb7VpIh-LY8r1Jh8RCU)Igy!j& zkM*C)G})^9xYtz@-(#Sw;u3?FlgTBP6CH+hY2QA@rU(slm<~hJrkD9(4c$rWxNJIf=dCNY%Sm(U zy!G$wbiv|9v#(`;P|sV$bXtb?cp$45J@RI1C0!uXr++QqjP{T8$>`du-F0-Ku~82O z&Ag_MJ1wpEa?qNmzbnXNq~I@d9YtR__-5)viNPL z?Z|cAwDYyYmD*8rFcsF{KpF-fZ5nX$K01M%Vy%w&VbTwrP77Ks@)*;{xwB}Hy$J7ncxsFYT0~NPzpX zY4*bG*m^tKYR%sDep~P%59`}aP*=GYDy_aaCka)27W0)FWu7V`!1*&wZN|p?W&+*Q zlJXC<6MW$L_gcZ3vRz5kW5SBYqLL<4^WiLZ_^V#+nwEaGYPjHm3JE7_VYND&l&B1 zuu8xlgIL-m)<^o=cpoq<^%RSfeDyCiRa+LT*vf>tdA}Ajl6c>1%s9ZfO1me(&&&wF zn33coyKc2?K41qq8J?iF?zQggCYfCOYzf|x<1u~_2k+1HDLd+2*IWEXrFP?dWGp2n zQsq(qrpM&)@(GfW{A_y$^uFU2<&}Of}pMh)cFQdmsHN0_`84NOr(a98+jT zwBL(-2Twr`kn*Hbul~%%C}@pREoCPy&1r4xCbP2(OiX>&4c;7K(yU){{@wFVxosND 
zi(}Y?q5m2iPX11lqMud;(;F>}379)I{HYe-+iUSFT>-79S$T8#l<|Mhw^S8f$GutD7AnhG>F zFnzyF?xnpP`smcONb4qoHM>uuW&i?Y^`X8SHL#)gkN`_JueMWN<3P@kE0?kpH|CXA(3>phXT7>6QEb=p&OdnT^u>K4~vHf>Dl zxjoSb8LYedb`_`Z(qryjSzfg?E6K@g#-upV-t9Sod~gip447>&n4l%9s790 z>P+3auc=AA{M!>et+*&W?uz{rwVWvKR82KUcSOG9k<0L2Sqygs2H)8Cd@*0#r(WJ{ z*o}O;V7y$rU;#ZNuBU0^nx1@C076wy*Nx|YdGocTO zykwuBc#h@yi^z8kCY6nY`6WR=_-Zvs*#Hd2vFhBnPEV4vC7R8r!W;)z;K{w?`Z=Ag zJrbBjMVaw=VY9oFW6ADH%E;N(MWb5qrlj{so1+(GB$88jSJVruH};yh(6qLA@m+7j zT9E%b^WA~#Bwfq1xort{BL^WQmd4YhmByUiz&$+5+PGE8v~5(}kc9TQXf7TpxG z-5Ys|Co_G|7%dr6lIj~xN%+Co)=eL~;e~_9{5U>GeVcXo%8mu0^Xi9LNl1av#1oAH zhY3WdQJiKvb5b3tkm@_F0M61FZ=lil8a}OR%La5&q;K1Tj#h7%ri(Mvmt_ zr{k2{79nA2(Mb^D-l=!wKMJx3bZV(xPvc}7(s-n$lw|wn^I4V+j`}i7LOKPHH0!2O zDlK9S<702kinqQiP8%5k>996Qm<()wq3%UL98Ab~GM|s#KsC~#3va;D*ygrK z#EX@P4|U<-}5_(H1ZLysJT_nD0tO9=mHMYV~5J^Jt9iIyfvc=5oi=l$DnJ$tK%voSDMPQ9pYY=P%lqI*1`;>%vvOlh&4$MY!mWBvQ``0M)<_e z9BJK_`@0x>M`zI99yegNvSoh!B%{{!w9S<=33MXs*1-wL>)r^*;!S)uTvHk%06wt|Us+%flA-FMG1U9+6e@%y#>7@1cXn+exrw*4%f%NL`uatozm)6s{VXuOq%8f zzl2#{fw__6>6QGt($l>H7Pot~lc|*QgY^lc+KyJLV!O*8&OtPu`licIy|K@$#Jf`( zqXU9W^@7aA%$L|MB;*D{oU0Q7WaNg(R&Du_Omhjr7Ts1qqn&CKkb!aDf_PYhr#yDZ zDpGGMF8-r47owN6^w2yaLg@tjSaE-u(|{eGzL|OB;2^yd{qBQK@sbOPM{n*e)WbgZ z`cg=%1+{xr!-6USq~87-;&GXnqMURyaVq)g|Imbo%sgDAJ8={g)gUlYy${K}STk6`s1_m3&>3r0J zxCr7g7_ivRZMuNHl(#X?!@!C*Kd$7{8|p=*pbBz>Iw$eh3uPB1_AE())yNyVv)+~@ z^(_=mX>2}o=1(9BYu9 zvUD_L#T}KII^1v-jP+8jEXi5VpA-2ov8JGcdQ-DD7B`erJ#OaM6VQyZMH$;e>VWg` z{YM0^bNC{{LXu+>fb0e(H~=%#$`jF#B@M#yLxO~tR>>SLVjhR}EV=5f%lMgpO7geKZ zn4H1|dJ|GP;WiV259JfzCDg^cjdv;<6CYu-CZW1zv1TOu;f5T_9(4(y=esm*bzZp^ zPF@?!?&k0xz4r-HkzMVie16h5Q;;ayP9@?pOWW)6wF2y` zQ@+(1%E&sE%Kd-fG=MKY`U5S zS{J+ENvk(*$iT*{YL=vq)a}bX0IMxR6drAips$)UoV%^=H3H06vFm$e-UesSmUe~O z4Q86iom7S1`)*cpAWvVPUUj<{mG!4x5zSj=RZQGA#cTX9Uwvw_80ttaw}_{=o+9BV zPD6azeBu=(z?7nPWADPSX)4DQoI z`Hw1t>&i{$tNBrR_k9P}Gu+u*_2L-oCn8rQoi0dQTrY_Msj&*+-j5>Df;G{FlW-Dz z_LFOR!Rr|q*NyAx!PykbSt_ndc^X>5%v_J0&)06-Z&%!SW`DF5KB4V@G9y!b?=o;i 
z;vJOo6$%y-S6U^w41Rhwt(SHNp^|s-Y;PnupbZyIOZ2|8$BD<_M@V% z8n30VPO}?em1aMp@GOY_)g+Ox13he^FRrYcYM_p;)${U!q@*xav$l6p z&(4#_iIKin0-l@L_zE8MQ>Oe|H}n><2V92(!~y#^EP3@ls%sVvYytsUZKWSIrx;&A z>rg|#;aS+Ow`cjA$-j3W%^xas<#j#TY2bl(t;-F!hj?26X5m>nE#-WrI$pvz`bxps zv$m5!o?Tc};`-h!)D-VO}B&%&BaEA{_Z7BMI6d&tRU9YQj z-yopZn<+!v>yHZGh|X*7Y3a$oru+bphCG_(7SJr zcYB?6n7np2OV-RZ)4zGx>;)L$w?)TqRGyK! zJX4XTvC(lUad(Y)YUxQ~l6!ivbx;2uZdlD`lx4fV+}zF+mBJF&Kmr2#%Yw~vd}x8r z(Fn}pQICk)FymEXevGZLb#0v%{;FSyFQVeW%3iq6UiJcDq)8)Pd)Vf`)=IROy1TK! zL(OEDqv}4dC?7ZDix4-&`*6>Nw~GSL#(TK#<0iWdKgcu^+WrSC^FI%d&?&Aw{XRuy?(%APJKiId2ml?+yV=m?ff* zHyN0eSpbKfSpF$QD;q2b2@ZAu1 z-CPi#Zq@rg?dGdk8T`^}Z5oJS)>9D+#+2uA`rD_=kb3ltAxjamjP|0rGY5>$K#e0~ z^u+GKDiN*S!hMTTd8<1BL_bH-lD06Y>|&q>9f>82G17 zBWz2Q;px(b7xf=duy5Q(;WGeU&#|;hjXA~LWoJYvIi-RJGM`}{&k>A$T$Ex)_c5yA zg^eaBMc~YIv0QRPeDi6YaoVea*@<;rCb&QrDTnTw-DBPjH+P<+A4?G^Z5ZoD7sR?n zrd#+LV`YGQdh&c2C9Cn##=OKzf)R$9s*T$%G}sRmednudw>2GN+Y7fX6V+fMibM>j zZ{)(c^=$cMb_gUR3Hw8F5%u?~E9Y2x>jp5ZzHygx4fkifD6)taaX`Am+&>J*OA!%+ zeFO5EAcqE3B}vVv(fh;;X`VOfk#fnmMMfM_9Z`sLXjl%HDkrpYRPCadYU_3$VW)QI8Dx@@#VRpn znfbu=Mjt=wn{f$xN5jr9s=Q*SPnWu6VupHJ9290VVdCS!JFJ_T7Eoi_qy(K0ZybA$ zwK?Dvo4=FQVa}sX)Nk>DzQU1prADduiwiR1)VpincRuJ0Ziva-2d}fE>fA0|Pn=`A z+%;&l&g*eVc84MMP#a#PyKysC(r&xPPnD0NSt)lJDy{hKdLW+97G#gs?Ciy#Vu*;6 z9o}YQ-C1#EZvkmWR3n>e)BBvyvl*4CJS5*XT-#$jTVH8=@-w96!nSkwRyg)#iG}I* z`~eQ37e(DUjANWUExaUgrE%AqIhGB9l0C!A#|BWIhK(2MM6DZjN7>JyZzJp?BLLNf zJe!VVPCZT2-mf$N^ldc8LBvNsGF(lo?p55L>`oo=d%)l#oiWwZhJXiUE?w$ z^7DG9U`TsJN)h}m&xZ!IP|$X*dUScXQx_I_)1=WPf4H7lf3ce^9-+F?cE0T#P(`gj z3|c3!=yuz&uCCl6kZW}sq?Q1v8-0U{Oj7+8)X-2hY-ipq6XL7iFG!$%h zY=>)n8C zumShcnQ4INkJrpG1s?)&ZOMyJ4So`xgfKp&ev=Q>#>^`WSu|+A5Wnl8btysZwA91a zYx$ge_T|Q}vSEi+JVh#dYqi;RJVkxraUi8rD&V)G=sQnUDES`iZ`0-v<}RNWkuCbi ze~$ZUH)+m<6E&mY=L_FljWTGQA%aGYyWLfuMyIGzFJfp4TN-=m#$nbUt>M5)FeAc@ z;ZX2o;d*-CqNwF|0~8rAZ!#Qc#V$9$kjuLi?0-gHfF!cKZuMBZ&R>j)8+kZ77R3Hh zL6TW=Y(Co$EWh=wJ^OhuNy*gHx01BsGsKoIyn-ZUU?Pj{Y95WdHTaD%u?Ux_## 
zxC~3mTbT*Ka5;vhc~^H<+sH?b2XQEFH-zO%dEO;-Pi+Q${{6$FXN@d;AWRn9g?0@; zKvDOaO}k3QEK5MQl-V$@aSe};4=X$Kz@#F$w2{l;wDc;Vy~?AG0aoVUBj9b1u&lIjFC^`t}Sv>tGC5!;C}<#3<3x?WT| z;93^vZ|ry)-!R8Tssk{qiji|DR_}06-9r6mWK95mbklEWZJ*0xeb8duPwuV1vklhG zpw{qDEJ{28xxaNo4v!Bhh?U++bR|px^=Cgm^er;w3!Ax*5{B9ZEV|lCA2TGl^ z=1m~(xA=aLkP_=IO9;BWR)9`d2aTO(7S_Y-KDzNDvvZDxUs&6Vgr=@`bLd@iH&Alm7j0Inhh2w4d}r~18ZD? z?&U9fEhZFD*PW7868g?Qi0uBu!!ycxF8kS`W`ir>NCWvBUcL-b&-0^>cy^J-{iCad zW)YFFp|kL%p625FOLDGeI~Vdhq`?*3QZUC!uj{=_lRO{u z(r%VFe;;3Ce|wtBR{yf5aqP15;1Dj9I3!QOu~df(7kLchYUnj(bSf8cv=XxKfk9hi zDAyD+E@k&ST#J^jxS8gOxj9QUhO7?V5i;M(a2l9JjvA9>nzfP)dimrX8m$LpKVg;G}QWczKeHotDXgJ z$%&nlUTtU-nen+6(hq*@*KsE5xpFwmXm3-s$D!A7%DisfrqN0jG(F#>Tb{Ij)Y}Yi z_x9=RWjFAUe#fJJAvv7fGnKv=KbW{>%{0hc0N~q6hpcN<3a`|JihCjE7uHNNaF8y$(42^<3$EvO#nq`__ zd4vT3_*2cx?Ug!po=~i|2Mb6?>1gBv=T6yPP3%TIg*YpuYHk%sy?fixMBZOAGGCpG zqPUVx9Y{ayxLOOKBo)%25q+>%})Ft{3_ewlfI|j<4HJHogo-5?WNK zBS&hZFzrg9NS6oaZ)ImZ4>M4H125eKJN^u`?w(Vf#A9Onf2%TW7zif_N_>&hS)Bm6}dIL>C8VrImt%~Eama{l<)eM1(nt9Z4Iy|qGEmhG1w{G@+zl#{qs_}X=Mp)5RV!dr` zG;6atbgdHD;8?d|M$T(xw{Y)JiQ(i?olV6T<(G)>y+K?J8dL5D!?~ECX==7yQ`@=Pf~Q2c-flMI^KK})^^bcx`7kOAgN%SZ#axSM1zi@`hn>Ph%}7XsgXg~DiRz)& zoI9U*Z3|D>8i^@|I|8w?N;pgJ4SG3jqc^#PuUuVOvX#n}PVB7Iw6Im1?SB0Ws@FHv z_FC9|TO6irO1O>na&r4?GnCo;Xx)Aqkm&bO;!Lk*FLwu=c+IpEAE{DZQ8rbVm0kC= z3S8YnZ-Ja$$hy)AeM0LBFx7dUd%GifwdlDpeTc);+v1 zHP4Y~BG|w>oohJz+$s|^BVeEZov?o6&xmH>>AEd6U4-5I^EnD?wo{tIZ8Dr->niH5 zpW!B`UM=JuXQ(CEo*bDp#xb+fXfCCuC%gl; zoZ24VqF$HN+Wt9n!{h6t2~1!oc3tpQn>Mhy`h_0tjVNyVUc?X!eDEfk5#As{CD!rVook;j8#FRC*L_Ga5 zov&Ll{8ysKm>j~9qvZ?@<<1uo z($H_U`WM83X%>X>1k?IWZWfeAyBU(^^;;?lh!Meh=^161?&9IF%VF+tr7{h(jVL^o zjw;$w3eZqJcxVet&KWL_waZ|bRc@D)V-sw>v~7fXx{)jAmYiC6yCh8?Obl^gcFjCP zt+tP>PM3lA-?Ciu`9Gog=O>}^( zaBJw&_=vt9p!N__ZCN9SvYBAe3Tsl+EbIefJi)p9W8zM)_$3xk@?TKv_(s9D*UeAw zFGh-z?q>GjGcZ1}L9M;(QOl0oL*BYlK>p72l{J#&4z8v%KWHv=h;Yr>X zp*i>i14FP+_Jf#)2MoJAJs=MgLp1R6aTa)~jUM4H#F>5T{Z6b}{EM`kIC~NE!u45_ zdtvv(Snh}QH%_lQgJVs5k8cI>P1Ntb|J3rZ)5xpNIe 
z2)LyAqt}^{ikKqbtBODU*}wRWzSU8Ro&r*t3SBUNvst${q<5`}&SX^H75f}y!}P@E zXX@X%RlPDDuZp~>?SXtE*Yj5?e-cl@1iyU#4M2b95JstB1^(_FFb%VT$SF^2z~=kFs$ zuP!mr2V;OgsU6acm&$aI&7ZZ2v~DQV-ZLO)_b|UEv!NC`Y_V51ReXcB(E&L-VJPsi zq-V4rVzv5BjYEY6w$P;{k2ZdTW$&<9S<-z9GfH}z5$u7|i`nnNn8opRe(ZVA_s{xm zTHD0=G7ed<(ergZ%rHXd(A?^Ajllr# z4}#fl&SCjL+O_bez1uzSJxDn1FUp5-DAwi9yBLi3=o`CRj>!^rNufMT*}3oDgq_3t zgzS6&_+ntRtWDphW%;&0dw|hGZMokm=C{B7`6m`W>fa1be-Fj@0{CLb^zY-p^~QsZ z|1(nxde{HgA8bHf4=bnbe9#2+`(>{8=mgj3#Q{4TZB7pG9KFm8em3qtHg0MT#p0SM zcZdnkPITd6=W7zbj-2&=p5C(6^xcnJ=Eg#QR-Ac_~n|U7dnP)<*t5m5@6F?++n?r);VaDF#^Gx3^bB-6x;}$OGYPE%z z24d_fgYISNx1!Eca6->Hk!YG6!{l4v#wCWs0-^fQ0Qj7d_eFeFE;)&P>brG+G?NP? z>u=iCuY3J=#3+qNfC(CXkii&h=xZ+_nf^H1EIqn9<9C~xBy{yRBQ(s>By&Wk?kY&S zcOfX!J4jj7thCQ*5+v66h!eYcS_rPeo6gOx;24$n<_8bSskWT{7){ zupuylM_A2jQt++P$UXcdJo}4*Aqo_{uK#2-7KyMTNY_$3NKnta-8z9F> z+8}mnA!p;9RW(#bhdHlmX>7pjt4pMFq}AfI-Lpr;3<21Af@fra9?QBWq8F-ssiBE2oYY!YI1vUn2L=MPzl8?8#GDTOxs&IkDphx!D&ojca!d*NNZ*1vB$zIW-&k#!mw z)VqWIX;wz)=3u=^qDe=+#0_Mr{htl0pq4_at*aM5dslBsg*BU-$|j-@oMvu7Vh&cP zLF)_5w_jPL9vp~)ya+OXnQK&Nd`A>v7shh`Se^GqOJVAwa!en^e0#4|nlZezzM&@* zQ^YE_y=A8QLbNS~)+%|D z@pbFasHfSEEOSHML*TuY<6JPI{u;l z3S*`0mj&?@cZ^HwOZDuQZ%Q+p)~k3973A~P0~J@r0V6d#-h?w>mAngiw^vr?+w)(f zKtN6=irFh%@<~qB(2ZFxbFh;7zilz^=ZP7Aym8*r*;# zS= zDWI>YN&@7?2>7q@L@sL5+_{JfIf|}IEv`q-?|)8=J5&TOeYIP(^`dQ+a@=57r+C99 z?B=oJlXH8uPl?&FWExP&7FTd1z@_g)eWc3SX|m2^FMMI$#jyTzzYiz!&Kcdgq3!bX ze~!8>l=gmL9k&RmMA5S~ikAYh({C)_$vn(?&u5b2T^KNzMpUVS*L>VIhqrTJW5X{7 zO_s?c=DPp@;@Zs>0$VcDr?KLFX{SoWbk1CzuI4%liD_kAYEqb@YOG}yhT_@-^nS<1 zmL^iRGxHVsA3%^`W$#bI7v&T5rg@`0OAoQts*B>8SWyOB zFNe)B>lCQ-fYeL$!QD-V6M-_-g!08|VrKF&8>C<{ul!TVg}F>XO)-zv{AF7^jx0BX=nQDP1L*48B3YIXPhp% zkB+7AI~RL!rlIgI2}VU};J6o}~}sQZ;Xt{+;L z>dIeXc;CUC?Pru-g^A0ZkffkU@@2pC-xu0EobQ*>&=0=CT3Tt;tH8L$+2;VCT7$QN@|vTk z%5MO8LkFAMTO%pOSnXj4NCr~Y+kzrviH|R8T6d+buXk*goew&!tQGdjY1jNIOu6S8 z=9-#YH2d%l9PtmZJ`&fKOg=L9P55KS`M|3q5w0;ce*EK2Vc-YSkeEY*V5N>1kNXd8 z%e>Y4Q`xve1a}HIU;eO0L~E$1I7QcZc6iB^D#%BW6K|!FkW$GE&zA7O$TMp#iq*`9 
zgXn>Fe1d`kimDoG{n_6p^(z!IpOY|uHd!}gthWxc2!HBp>E-DT{@GuYwsc*S zU8R$M<8d#%JCDzax^t6_&TUyrp^&?9W9s^+sw`HAbc_bEK3inYZ_{}hte$ri z75Hg z+ojjGcuR{lUQ4QCs@@yN7qPtsYZ*SEvWT=!iRO`hK@?$}j#$Ato9DR7)>m~9-VA@Zx1z4CzhA{+3EbNUJyrj6@QX-~o~8%!c@ z?t$~KqIo-|ik(a|YLO;8Ux|#|)cqA0-#k#vRK;eLGk2g9EFK1hu`JTl7Vk@$n9&Ck zycBqxv3>Xx+scONcb{-xqy43Yj`x8%gS0^SLJf0cJpF1o37Pa+pSQnvh5*}L{T zk!Bv$3hqp;)gh#*DPcL$g3JxgaoD{q+^eR9s>DeaCY5~h`lLu{ZB?zN>?k2y!i?vR zBRR$c`%OyrmWev1k3lK{ z@-sC+DF}ZV#(ZFGpwQKed51)30NbGge_sX>^MzbBM^1kZL%u1@7pWC*M_#c%S9)VL zQWZH;OfadXWFGSiiMD>;&54Vi?{{N~8^4|(WAm04>u2;4C$DRjTso53Ij0-Z!_Ot(2a619gM+@ zeJX9Uz?r)1IXmAle%m@d;INfL4o(=#i<(phHY z_P$LiU;dshd@?rB+z7Ip(`0F^`_^27T%W~hq^{D`H(?n}Wqz_=i-4pO zQm`{lBtMldrde4FpyzC&DWzvnMP-TK7PHWmz$sV1Gc;>`tY>Zl+hn~AjcERSsCQ`3 zpw;V>wu?jY!O+F6eOytgAnY{0{7q#N4*jiL54v^1|N0l4uisTPTM)iyH^u=~<*W17 zyYLDf@pIKPHCl?f4nTZw_Fp?!eXK>#Li3zs!EES@4=4raaHe>&CSJ$UkoOrH@+OCl ze+Y@@oUyT&4?cg$3$!Xh-jf#d3zVYW$PL{(DQ({CJo1(@JG4FfAXqTU7z@aw)l4CG z{+VI(aOyDac}3Wn2p8{#Lv6|D+c`p!kdbcGk+cxk0wK?K6^_qp31ozpBf~}c2ULc9 zB+|(&tQokShTGQcv7H54prLThe*@U10-(NWrm|1=403K>GJYW!B~10<};q|rsN=M~1lc(LYm!hS;3E4PqBtk(Eh()b{_ zbXBB5d|7*HhAZO=t*H{`>o3z!$+Wkm-D~8t8ArrUw&_;Hmm{NE*U}sXZ+;&A6FOo4j+5^q9VKwua%Dog(Nmk}`b!<#m+{u*!+VjP z!G%MGLpTk-{_;2{m^h_gFt~8P(z-eUlKH6MQ|Y6>b@t_J-&pUn%*m*N>52_){D)gz zV~@G&zF$=tiolv)Vw@g0y5!TErYlb;S{$`NdV^f0EGQM0*(zqUl#E|*M5Ac|@WVid zT(xTVsjWmcPi4u1UE`k=BBO8~7Gb*{L`%v}BxOeYxo+!NZpNd)KUDL<2ZOh|kXu~2 z1O0*P@laFWGduhX)OoT!BE3W*|0zN^q2P$W8V$nQ&z#YtoiDn@JZk~ohoN)fL-_4m z|JvkF=B^R7zS#-{YjD8h`J+NRCO$rm85~JuEmmpQ{Q!HT7lb=*O@E{S-DA643)lL3 zb-v7WAzO$gP&?>rklI#cw5gE(K|oAjBv(!SmcmP*d#=jg#7^ic2|xA0L7h|4@Lhds z20_WL&TIGN4wx8_VSvqCanR-SU7L6}BI$T~nBg1e%Ur5!KB6WQ>qJwZ&$Q0M-6*jA zo{#tMgznTqkv&U?3m?#u5|%}8)1lLdU4d1n;L2VP`z=2^!U$-jPR`&g?0@(2KcoN_ zi|b~*(F&-WDFnYe)sN`a+KEli=1pV`Yl8txi!QmQ@$N|U#MKeqNymto{+Lg1BVXogT>s%9JX1xtlQRw(Z-Uh*IHOI$JU3$=TR(5UH@MfSO#BdiQlLT>QA=6SB_TmC z%$oJ|cFo6B$0E-oQdE{)0Mr~o+w$eF)!@ljv6RXs@id5v=KicaH{jEpCM&Ta+G>{g zys6KJzX3~5k_xug1nVjf}3 
zi$j(sMZw8yhsu=N-@SptF-rP6OoSe;RqV*H@{wxh`9gxogNh}*0}_l!>V9vbSvN-e z?JtNxWS%?WpDaLj42g3lg1 zq%SJv*LHYs1@+!pf>cE>K!FDE zO8y_B!b4lv_H`*K$(sXo6Qn#-vKss#JJMR)nxzTwYU;p>v#+ok>1Ui8%?U(wS8t=+ zphx%h6i?WgJXzp%VklYL@(5DGi~V;o$7$vDUifrjuf$>ue_9KcamnS#mCJ({hpv!xKSw& z9S}T6Y}OqctjFqppY<7onrAL{m=TAg&Kd(Y-zmL2gkB&c$*>?1a6e zfp5{$VLmm(4}y$Bf|@zA1I-)?&jw#=F`tcA+Q|C)AgG^9kPqC!rDc=A^J6v8T}y?o zw`Y`XrT;N8!YP!~J)_x}g1vPgjXB9Nu7Pt(f^Bp*f= z+bjp@yxSD9Zn0>0MVs9EJUd*aUQh#|SCR`<%wpRx4IPL%>wD-POVYLfy;+14sTN&3 zKl}qzmr<0vv>7(5J0#8eQW48`#a|*;tWmDth%Us<``PDt7Ny0t?eyw9@v1JwEfqk_ zNvF=Cl&n3UPrMVQ2I0}Ei_4bmxBK=XT0OR*YG)?AoXwJGknB(^m_#>j+aso~I%QH$ zf>5@h+WIIT_kxpSrRQ194~^%ojFlVQvJD(|aV0St5#X!aqVuYdhEeV-46S2NSniL=BN!@#_;4$gJxK7z!knh4_^g z*zOHo&aDVdDIp;|b4*haNm22)M5rW*DW8EEl0@fF1Wf_(aQr)JPiA(Yn0SNq{rP17_(5AIUz;f{znR%f zVq2{h3M*XmK!|48P&WaG0 z0_fIXk|^of?)XCLVS=6^Dny1khCR81FCBg|USLXS_Y~Oi=*l$33|A{>@Eb5NjW}P? z_I$c|ns1r;)-k`HwR%cI57HbA4xwVa^-h)RFys@v9DP0w(!>%$Q!1m7;T&&14$n-P)IDG3&;P?3{>vQwZ)^AiQ$8Ujj|*@s>}QOOE-h=Oe&U4-J4#;Q))J{XoqCPdk->lvl#A zc4WeSvsT9xxiG-Rr?vvi8^>c~im;wn+d zX>cLq!h3oM@7s!6TU5ko$cj`;8Pm;xjTQZot?28S>g{+@HhzMmHe{mm_Uy8ozm8#=oNbNJA0|7iMQe}RrM(wlD--HWkuch&SZj7flJx*i2d*Ps zdMI@>qs@Tanl|92{#8vsTBEY?Lev;Fn%7WXe&E~vlUd$1BuB}rhP#~aP5NZ<)D}ye zWP?_|5HdxQgt@vlqFY0!3DiL8EFw~+%{>uUmhdE3x8av6NGy%tIUO?ean326Wf^)w zP$e?$;wmD`^^c2R0r|8-Xr)B7JN3+Rp><*KGS0C(Ss4Aw6VAV~uvk>7 z4K@pdu(7dNzJ+A7V|lBS!5r_ewUPMv*E^?IZ?;@9V>UrCTyiY136G?jCClKlleyI} z#i+8&)AS)Ux4FolJn(m zEMHU0C+F{$P1&v&sj#teoLQ=oS#JkufS&P__pPNtOP^r+ z1T#MlWQ*;0M2^i0yCF(wt|T?Y%{~b&m=YEkXwD?#lCQo~seI-pTwd`N2MH?uhxOk9 zCFcz~i=88_`PTy`*bQEd$Epe0zM$k*QbCse@^{h{IfC5R>z$Nyo{#BxU75LM&`M`;~tI9 zuyj6Kc`G1>8uxmDjd89q`X|uu5nO@*qvdq8Eu)4cBwja61@~VJ{&zj9d;#Q}#4NWq zYcVx;Y8c`8Z*+C{?U(nfg>a3Y9nZf+jGc7%zex1o;a|S2p}9;&!4qpghvFdV@qc^& ztw$HnrTHdja#pk40=k%3;^^rn4!m3-r}uSAXiCj9oQ+%Ka=3E>(<;ehLZLBq|MSpa z_L%UWj(*Mml0`i+{oCV@k|DrA%?2%oHy@og$)NL_fOh_N`CHLrVeA+GW8?#AF-8?J zF}_wTi5|P_-#@O;cE@9=%nBu&hN|=V(qVn^W!3J;xq)XI6-W5hp`rn!jnmKD4OFfi&uue^N`%a 
zX2eO76o5Ttb+vHdEJ=9SYK##^K%>tr#Kz&%0$nzi;JRbN;GYZ>P7kvGZ*@RLn&@eF zh5JGn<(1TCBw8*|acz0o%+G$bDOw2j`Z{ztI#Q>Zf`$vA-DrHgSOJR9>CX=wBV9rF zH_y6TNsW`o+{S)*Ujf_Xf2?U(Z&yS(nzCO5mBda zxn2ke5mde@I_bJ$Wp<+PRdO^BBfA1YOW@MNmuhIS_kTnL0-RD2L0)DzXw7kEU+V6U zJIRFMr|v@KAC*%7siI4@w#@?0QUY=Hvxj|~KNT5pan)bbGJ>Z~qeO|AL`YcYELx2% zrUpH{R4r?p2c%wpGG+5()c+AL$5`QL+N_nEu@J+@rmtZ`zsJQz9NMv>F=696%p*OO z^4WNBD`-D41Tt(@8Y3Mjr>NY2ToDgvSRsGr51`}YMvDpl+ihkVUv<%Ob~VV`%0?jk zE0VbNVxhhg8p8*1&Og|$VxN$QE%)AR%C?iu3{>xpM7u+r*A7Ww>;8uq%D0~L-#0fa z?U_Y(ttf}$ozc~-ICC%}!fC!GUx_GYM)r}xjnS2j<_*sIF(>vyeRw*UOmdg}%2Zbp z>>K+O^;&Lz_iv#UrMHLS!d%r#2Wk7BYmR(twWRfJ@2DK7PJ8yX6L;W7OMU0g{inwq z2zVo62_e?xsOGauW8$oe|=P|Esz43Tt9p`?y^J5he5@ z(u)ETgwRo>DP6kKgwTsX1VhIHxIsck0s;b|gertYYEYyIQiRZ}^xhF^!Z!ijy7xI3 z=klAI%rj3iS((gw+wc9)dTkX3U#whZ`D}h8zPa{`3H_y<8)xbT_TdCxrMlUrFGG}P z^jj`VPhnjl2_WKa^^-@&C%{MKXeFbs_0NDV?r}>q;m^-gb3$fStXw~z_jAYbY_V#g zMikwtkR$a?3W5r>uezr7mY+8;orM+hc&=!6wmo|>0#yeJ^Z7U7VB>uk($`Hc?{phZ z?-oJRWMC(dgcu7XD*o|v2}=&-*jH$Hy~69}tS-UP2yx>?0m>CJwvFkcG>P2Y%5%sk zIjW450D;UuXnaXG_l#JnSJ`*M&(5B`?#Q%LCu=^I6cT??NI>eDnc=c+sh-wPd_Ff5A#`O(q*l$V)ofjc-2K z2aBMe!$JEz7r|!&#ENGc^zpkexq%mdjrcLqt~gJ}w3_~eS(R+dPo_bSbL9#HGjUl! 
z22ilG`2P6G@sL)zN^qm|9HL3oM3iy>r`tE&G=Mu6`}eB*)iHdxXhpCrd&kwg5ii z;j{a(vA1~Q247UNtkg~jmQcOK2J17p+NG{8^QOqJe{=<^?nR6{PkDO#!lu|e9F418 zR0{L|o@7Ap-nQJg$J$5;ccA0TF2*9s8(v~Pk5x(yZ{OHPkh3{DP#X?dHwO!I@=`>K zJ2E93vStq><@R+&W{MQ+4GIjWi)gP%kD)8gB6lK~O*No83Htv;_Y@2~xs>2F}t z_=734{p@adbE8e=)yMDh`LX<9qSsfAxrY5xZpet3nH7@K*bLWTE zv~l5YJWD6~M#cC>;u_Q-O+C5< zS|OviQb)8lJ}|d)+xX2o`b7L0Q2B_|tw^DQj+>Fp202w-&h3=+u7S(uFY3Jc9o5rP zYQVAJ`uJo=d?YMqN*y9jYbEL<@a< z5*|$}U~*ksXWDpZh!5IrvjtHgXRJ1NG$aJ_413}bP9+Bfu162={2I38rh*w-y%uc7wG_u~G^ZJV0yY zf-e64y!bD)TJQ_4Zv6wTewRDAS{kPgEwx121KH@SqzLZHh;QGcxY1>!&%#3QV6>+L zZ@JE8r!0^0k8$!E*I%7QEPPO}MF^D&vshoYeRZ5@Qx{L`czcrU*d0bz$3DoeD?(yA zOJu|)ZnJ=;3z{^&KB4z)EN?M%mxzJg)ALoPC~eK9?j;G*V4Ob(*=DQQ@OE@iRHTboeC0- zDjRjK*Q7EQ&dMu%n#v|epWwjTqRfi@qDp|8+q;Z=e^yf$xNZMPB_t;JazjNCh3hBY z?D+iVSmF%i9;S`*@jkY#c;zR)N1uIuj^zwaQ9EFEt7o2ok>T@xNQaz>Am*C-m15Ri zKM45GVb&(B3^*2qgJF@`y^)vA53C5%tf7YghMs8$e6{jJu*M^EUbzJyH=JH4O(@4o2%c>Z~7Q&|1PSG7sgy4veLv|DGni4M0+4*zai+PP4 zOo0l>mAvvvd1Nxi;PE(h)NrJqWTC|6ps|*3x5gE$X|D~ALmO>Odr4P=8_ZT8Srezx zMZ5sl&gQZ8(3{&ng$0T|-LlLEUto#@Addu}vL$3;Lr|J7_0Dk8bXzk7Fzzf1O>m8l zclF@)h@4rOs2X^Ow2PE%V~o&hej!YBcXhPz`Km`4yd%b3hpQYsL?dC@9u$qbi?O)S z9kobukp7Y7ZVoWy=3F}Z!LlP`PJH&Zv6>0FMjx$hz~?3lG{3}0VsK9gJ#JrYT+_Rl zu-6doKDSThL^m*`6jxVCWXokhWBDn?3ic;RyU`q=H!#39?wTm?HkN(qta|gmkvW^o zEh-jGqkLXz}h_=1p^(bL$EMz-a_l$bo{76!-F+BKK)#tYceh|6;m{a zx4#}x;6B(sfba_$;A}lU(Cgqq#KjNP0!>|lx-+||xuB>-@dOtXdGIKSFdMaf=U#h` zqBCgqYGNxSm)|3F3Zjt+jComg`Ql>D=7b;r;QpC(xPYbai~_jI>wVC!?`{)hO{aKG2^Wt`>8q-(1rgOM+bX^D1q?@E5f04Bk< zZ`WyeHEhj%54$1K=>o3TzP=;HyOE@^hf(nPMxeRdX50FYJ5SGZVx1SUd3 z{Q@_lNVL$vaD&*03{(;}R*p|IK3(52Hw}GP;5YCmY#KUIuPZ1Fwl*Hnbm_q;y~hg6 zg(GF^G`&Cc4^ag(#76H?F5TS1vvVv6{klTyq+Olo4oB*ryu1Q5=O>@PFrrEAR0iUo zOrsJ(^kYrI@M+BpmbmLvHu@wT76}5$!1#`NQ5eO|;d7{A+8w(WK0gVG5B)Bxrid~%(A35H}f9Q za?{~oP0!qrc3vb?xrhwbEO1vWI3Ht1C5UX-NrsZ<^*W%$*7>=NOeoXE8#mMAH1s5K z%<)#57hcz=W!upupw+vtW0Q-B$xH`XEtG}DU*?h~G5$L@_iB(dKPxaQ0@kyvm_WK$#_(_byz)HX@5w}!cKEf?2B!AH~rFZPm$$5Ru>i={} 
zV>AMKmbJ~rd{VI1ZXk1{X@W30Ui%VG&A@zHIJE6;`Y33#cp>w@2M05O|G`S0Dx9p? z+TXhq$|Ah#8@Vp`o2xMWn&+M;=YG4n&5aa4DHjCF0o$%T95`X#VLVe_f3BNNq$uZbiyxC z()u~N-q#(rit7Q`VbOe6^fN_G72h&mdtnxz*XZLS99b7tJoS&DPR2|UMHLETiV8%4 zcqu$!s(nzM40$n_;>!xjcoF;8T!G>g&q1&4kQAeb;p!E$O}ImLIIjn=76!@KowUyI ztC2&-k&uM^%0;1DqV<|SZ%}=O@wg1Gj2Y$N%MdUkvp>y3$`IRYqv0PQ?vawC&`#PX zb3I3Z8>5;PQS0XDKDL_4VRIqQMuw0YqfUMG&a+$2&~>A`#vh&Ulo>~rW84m-fGr0F z4A<>ksqr`cl7PS9tK(e@F0c81#7asPTn|x6+t*SK+T!2|a*v#v^ZYn>sfPQ8&35T2 zf1w^2Vz=4@y^(Q+XikoYXf|(clQ!E(CyN0dT~0LW#xOgeh=%cvd6LQ%;al$2|t3%IIR}B5;!9l-}Y8m zwymbpZEYLg_%R$uYD&Md+cz4o7`gZ@+qPd4v$wH{z!J*p4nmW1Ur;i>8qYApo961| z*(Ovkgq|?a+P>%ZTEq+rAR|FhaaepXxTxb51^;}L3tffc-0p3wg?M>Og(<#^hcom9 zBCH(E!xvX2m&25ImkNr`Zo=U*p3aHFbR!*SDJdJ`A5W!8DeUPry0T4+yNWCmv-IU>B>Rc6zTJso+9WVWx14L2G%2D=Vw+8Slg zh6gx!iH0l=){OhGIr={9|B_Bj7G~1_Vb2xnzLj(cqS%W^ONdV!r~;PYYVhu-P{LjF zng}C9)a)*8Pk?65(3IKV+t|-=xf&Y=zh>XLcrqh7KXR+tEs)usTZtcXPNr{X5?uI^VDYltQ<4~?$b$&>1M>NPQ#jb8^GXA){g^Fc zO#c3^X|FrxI>pUzQHh64y`lzGR&(h@DX20QE!G6$c9O|a83!P>Xw z=XdHmHBi;Rh&sCrt3K@_*AQGyd0XvdvuW2Cp1TG*OK^|8l>6-VU#9Q(7e$e`)W(VE z&CKCLten3Jys9a4=4WGbPfWTQ36xem*^KAQTV60HX~s%}^s{d`Jo{+IRD12+!@4XxlVLk&O+-Xso?g3i)hNkF_EV?VM% zc#>SvyWqCMF5gdX48AORKmL~FxTY6HORw`gl{!$+kY6l-6*PJ$5?y<_yXY?^i{f9l zZYh%Wp6Vw-4eL{6U(x{Yd43-_fC9399u5WGYS~w~p1rzXlgOj|0JGSEbpLYwkIKLG zV;(*S?|ZWkyeNEnZ42|YF=(3~fxT(_7Wq(xu38lf$XJbOF)y9KLT`!RNdEQ?< zP*58d5o(B5CzDfBZI3ybu{+_j`|x`phn4q#wW4)~WEFQ~1uO)5lJwvIzbx-{R~4 z^6q#Kg9LnYxqOM6WO}p?y9RNvzH{QODIhEj(|kXuKzZcfzi}qZt{<*ySP=~U@=NZj!5%0o`9Du0~h*iAvQGMBM49wf&u`ukR_FHPpC zaU6HfInONRy*ceOSGvA!)v9ERig*MgrwH9f0;k?lYn2b^S?5~z+}dG`}OGmNw6 zCw^=IBtBj)J;;zMR>!{{woASB(Gw literal 0 HcmV?d00001 From a420f9083ea9993152a5221b8875c34470561f13 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Fri, 2 Jun 2023 14:24:43 -0700 Subject: [PATCH 005/123] [TTS] Implement new vocoder dataset (#6670) * [TTS] Implement new vocoder dataset Signed-off-by: Ryan * [TTS] Redo config structure, minor fixes Signed-off-by: Ryan * [TTS] Fix alignment logging 
Signed-off-by: Ryan * [TTS] Fix script usage example Signed-off-by: Ryan * [TTS] Fixed epoch LR scheduling Signed-off-by: Ryan * [TTS] Support .nemo checkpoint in FP callback Signed-off-by: Ryan * [TTS] Remove align interpolator Signed-off-by: Ryan * [TTS] Remove HiFi-GAN defaults list interpolation Signed-off-by: Ryan * [TTS] Rename weighted_sample_steps to weighted_sampling_steps_per_epoch Signed-off-by: Ryan --------- Signed-off-by: Ryan --- .../{fastpitch_22050.yaml => fastpitch.yaml} | 28 +-- examples/tts/conf/hifigan/hifigan_data.yaml | 133 ++++++++++++ .../tts/conf/hifigan/sample/sample_22050.yaml | 3 + .../tts/conf/hifigan/sample/sample_44100.yaml | 3 + .../asr/parts/preprocessing/segment.py | 1 + .../tts/data/text_to_speech_dataset.py | 64 +++--- nemo/collections/tts/data/vocoder_dataset.py | 202 ++++++++++++++++++ nemo/collections/tts/models/hifigan.py | 195 +++++++++++------ nemo/collections/tts/parts/utils/callbacks.py | 47 +++- nemo/core/optim/lr_scheduler.py | 1 + .../tts/preprocess_audio.py | 2 +- .../dataset_processing/tts/preprocess_text.py | 4 +- 12 files changed, 554 insertions(+), 129 deletions(-) rename examples/tts/conf/fastpitch/{fastpitch_22050.yaml => fastpitch.yaml} (90%) create mode 100644 examples/tts/conf/hifigan/hifigan_data.yaml create mode 100644 examples/tts/conf/hifigan/sample/sample_22050.yaml create mode 100644 examples/tts/conf/hifigan/sample/sample_44100.yaml create mode 100644 nemo/collections/tts/data/vocoder_dataset.py diff --git a/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/examples/tts/conf/fastpitch/fastpitch.yaml similarity index 90% rename from examples/tts/conf/fastpitch/fastpitch_22050.yaml rename to examples/tts/conf/fastpitch/fastpitch.yaml index 4022e8e91c97..1d552d058d76 100644 --- a/examples/tts/conf/fastpitch/fastpitch_22050.yaml +++ b/examples/tts/conf/fastpitch/fastpitch.yaml @@ -1,12 +1,15 @@ -# This config contains the default values for training a FastPitch model with aligner. 
+# This config contains the default values for training an English FastPitch model. # If you want to train a model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. name: FastPitch +defaults: + - feature: ??? + max_epochs: ??? batch_size: 32 -weighted_sample_steps: null +weighted_sampling_steps_per_epoch: null n_speakers: ??? speaker_path: null @@ -24,9 +27,6 @@ vocoder_type: ??? vocoder_name: null vocoder_checkpoint_path: null -defaults: - - feature: feature_22050 - model: learn_alignment: true bin_loss_warmup_epochs: 100 @@ -44,6 +44,7 @@ model: dur_loss_scale: 0.1 pitch_loss_scale: 0.1 energy_loss_scale: 0.1 + aligner_loss_scale: 0.1 preprocessor: _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor @@ -79,7 +80,6 @@ model: phoneme_dict: ${phoneme_dict_path} heteronyms: ${heteronyms_path} phoneme_probability: 0.8 - # Relies on the heteronyms list for anything that needs to be disambiguated ignore_ambiguous_words: false use_chars: true use_stresses: true @@ -94,30 +94,24 @@ model: field: energy stats_path: ${feature_stats_path} - align_prior_config: - _target_: nemo.collections.tts.data.text_to_speech_dataset.AlignPriorConfig - hop_length: ${feature.hop_length} - use_beta_binomial_interpolator: false - train_ds: dataset: _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset dataset_meta: ${train_ds_meta} - weighted_sample_steps: ${weighted_sample_steps} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} sample_rate: ${feature.sample_rate} speaker_path: ${speaker_path} + align_prior_hop_length: ${feature.hop_length} featurizers: ${feature.featurizers} feature_processors: pitch: ${model.pitch_processor} energy: ${model.energy_processor} - align_prior_config: ${model.align_prior_config} min_duration: 0.1 max_duration: 10.0 dataloader_params: batch_size: ${batch_size} - drop_last: true - 
num_workers: 8 + num_workers: 4 validation_ds: dataset: @@ -125,11 +119,11 @@ model: dataset_meta: ${val_ds_meta} sample_rate: ${feature.sample_rate} speaker_path: ${speaker_path} + align_prior_hop_length: ${feature.hop_length} featurizers: ${feature.featurizers} feature_processors: pitch: ${model.pitch_processor} energy: ${model.energy_processor} - align_prior_config: ${model.align_prior_config} dataloader_params: batch_size: ${batch_size} @@ -158,7 +152,7 @@ model: text_tokenizer: ${model.text_tokenizer} sample_rate: ${feature.sample_rate} speaker_path: ${speaker_path} - align_prior_config: ${model.align_prior_config} + align_prior_hop_length: ${feature.hop_length} featurizers: ${feature.featurizers} feature_processors: diff --git a/examples/tts/conf/hifigan/hifigan_data.yaml b/examples/tts/conf/hifigan/hifigan_data.yaml new file mode 100644 index 000000000000..fde2f169aa8d --- /dev/null +++ b/examples/tts/conf/hifigan/hifigan_data.yaml @@ -0,0 +1,133 @@ +# This config contains the default values for training a HiFi-GAN model. +# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: "HifiGan" + +defaults: + - feature: ??? + - sample: ??? + - model/generator: ??? + +max_epochs: ??? +batch_size: 16 +weighted_sampling_steps_per_epoch: null + +train_ds_meta: ??? +val_ds_meta: ??? +log_ds_meta: ??? + +log_dir: ??? 
+ +model: + + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + l1_loss_factor: 60 + + preprocessor: + _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures + nfilt: ${feature.mel_feature.mel_dim} + lowfreq: ${feature.mel_feature.lowfreq} + highfreq: ${feature.mel_feature.highfreq} + n_fft: ${feature.win_length} + n_window_size: ${feature.win_length} + n_window_stride: ${feature.hop_length} + pad_to: 0 + pad_value: 0 + exact_pad: true + sample_rate: ${feature.sample_rate} + window: hann + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1.0 + mag_power: 1.0 + mel_norm: null + use_grads: false + + train_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${feature.sample_rate} + n_samples: ${sample.train_n_samples} + min_duration: 0.4 + max_duration: null + dataset_meta: ${train_ds_meta} + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${feature.sample_rate} + n_samples: ${sample.val_n_samples} + min_duration: 3.0 + max_duration: null + dataset_meta: ${val_ds_meta} + + dataloader_params: + batch_size: ${batch_size} + num_workers: 2 + + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.VocoderArtifactGenerator + + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${feature.sample_rate} + n_samples: null + min_duration: null + max_duration: null + dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + optim: + _target_: torch.optim.AdamW + lr: 2e-4 + betas: [0.8, 
0.99] + weight_decay: 1e-6 + sched: + name: ExponentialLR + gamma: 0.999 + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + create_wandb_logger: false + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/hifigan/sample/sample_22050.yaml b/examples/tts/conf/hifigan/sample/sample_22050.yaml new file mode 100644 index 000000000000..18bc206e2566 --- /dev/null +++ b/examples/tts/conf/hifigan/sample/sample_22050.yaml @@ -0,0 +1,3 @@ +# Audio dataset sampling config for 22.05khz sampling rate +train_n_samples: 8192 +val_n_samples: 66048 diff --git a/examples/tts/conf/hifigan/sample/sample_44100.yaml b/examples/tts/conf/hifigan/sample/sample_44100.yaml new file mode 100644 index 000000000000..d8315623bbbe --- /dev/null +++ b/examples/tts/conf/hifigan/sample/sample_44100.yaml @@ -0,0 +1,3 @@ +# Audio dataset sampling config for 44.1khz sampling rate +train_n_samples: 16384 +val_n_samples: 131072 diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py index 89458ff4c4f6..d586137d5ff2 100644 --- a/nemo/collections/asr/parts/preprocessing/segment.py +++ b/nemo/collections/asr/parts/preprocessing/segment.py @@ -423,6 +423,7 @@ def segment_from_file( samples = f.read(dtype='float32') except RuntimeError as e: logging.error(f"Loading {audio_file} via SoundFile raised RuntimeError: `{e}`.") + raise e features = cls( samples, sample_rate, target_sr=target_sr, trim=trim, orig_sr=orig_sr, channel_selector=channel_selector diff --git 
a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index 47868d41d1ec..23ddb50346a2 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -25,7 +25,6 @@ from nemo.collections.tts.parts.preprocessing.feature_processors import FeatureProcessor from nemo.collections.tts.parts.preprocessing.features import Featurizer from nemo.collections.tts.parts.utils.tts_dataset_utils import ( - BetaBinomialInterpolator, beta_binomial_prior_distribution, filter_dataset_by_duration, get_abs_rel_paths, @@ -55,12 +54,6 @@ class DatasetSample: speaker_index: int = None -@dataclass -class AlignPriorConfig: - hop_length: int - use_beta_binomial_interpolator: bool = False - - @experimental class TextToSpeechDataset(Dataset): """ @@ -71,15 +64,16 @@ class TextToSpeechDataset(Dataset): sample_rate: Sample rate to load audio as. If the audio is stored at a different sample rate, then it will be resampled. text_tokenizer: Tokenizer to apply to the text field. - weighted_sample_steps: Optional int, If provided, then data will be sampled (with replacement) based on + weighted_sampling_steps_per_epoch: Optional int, If provided, then data will be sampled (with replacement) based on the sample weights provided in the dataset metadata. If None, then sample weights will be ignored. speaker_path: Optional, path to JSON file with speaker indices, for multi-speaker training. Can be created with scripts.dataset_processing.tts.create_speaker_map.py featurizers: Optional, list of featurizers to load feature data from. Should be the same config provided when running scripts.dataset_processing.tts.compute_features.py before training. feature_processors: Optional, list of feature processors to run on training examples. - align_prior_config: Optional, if provided alignment prior will be calculated and included in - batch output. 
+ align_prior_hop_length: Optional int, hop length of audio features. + If provided alignment prior will be calculated and included in batch output. Must match hop length + of audio features used for training. min_duration: Optional float, if provided audio files in the training manifest shorter than 'min_duration' will be ignored. max_duration: Optional float, if provided audio files in the training manifest longer than 'max_duration' @@ -88,14 +82,14 @@ class TextToSpeechDataset(Dataset): def __init__( self, - dataset_meta: Dict[str, DatasetMeta], + dataset_meta: Dict, sample_rate: int, text_tokenizer: BaseTokenizer, - weighted_sample_steps: Optional[int] = None, + weighted_sampling_steps_per_epoch: Optional[int] = None, speaker_path: Optional[Path] = None, featurizers: Optional[Dict[str, Featurizer]] = None, feature_processors: Optional[Dict[str, FeatureProcessor]] = None, - align_prior_config: Optional[AlignPriorConfig] = None, + align_prior_hop_length: Optional[int] = None, min_duration: Optional[float] = None, max_duration: Optional[float] = None, ): @@ -103,7 +97,9 @@ def __init__( self.sample_rate = sample_rate self.text_tokenizer = text_tokenizer - self.weighted_sample_steps = weighted_sample_steps + self.weighted_sampling_steps_per_epoch = weighted_sampling_steps_per_epoch + self.align_prior_hop_length = align_prior_hop_length + self.include_align_prior = self.align_prior_hop_length is not None if speaker_path: self.include_speaker = True @@ -115,26 +111,21 @@ def __init__( if featurizers: logging.info(f"Found featurizers {featurizers.keys()}") - self.featurizers = featurizers.values() + self.featurizers = list(featurizers.values()) else: self.featurizers = [] if feature_processors: logging.info(f"Found featurize processors {feature_processors.keys()}") - self.feature_processors = feature_processors.values() + self.feature_processors = list(feature_processors.values()) else: self.feature_processors = [] - self.align_prior_config = align_prior_config - if 
self.align_prior_config.use_beta_binomial_interpolator: - self.beta_binomial_interpolator = BetaBinomialInterpolator() - else: - self.beta_binomial_interpolator = None - self.data_samples = [] self.sample_weights = [] - for dataset_name, dataset in dataset_meta.items(): - samples, weights = self._process_dataset( + for dataset_name, dataset_info in dataset_meta.items(): + dataset = DatasetMeta(**dataset_info) + samples, weights = self._preprocess_manifest( dataset_name=dataset_name, dataset=dataset, min_duration=min_duration, @@ -145,15 +136,15 @@ def __init__( self.sample_weights += weights def get_sampler(self, batch_size: int) -> Optional[torch.utils.data.Sampler]: - if not self.weighted_sample_steps: + if not self.weighted_sampling_steps_per_epoch: return None sampler = get_weighted_sampler( - sample_weights=self.sample_weights, batch_size=batch_size, num_steps=self.weighted_sample_steps + sample_weights=self.sample_weights, batch_size=batch_size, num_steps=self.weighted_sampling_steps_per_epoch ) return sampler - def _process_dataset( + def _preprocess_manifest( self, dataset_name: str, dataset: DatasetMeta, @@ -169,8 +160,8 @@ def _process_dataset( logging.info(dataset_name) logging.info(f"Original # of files: {len(entries)}") logging.info(f"Filtered # of files: {len(filtered_entries)}") - logging.info(f"Original duration: {total_hours} hours") - logging.info(f"Filtered duration: {filtered_hours} hours") + logging.info(f"Original duration: {total_hours:.2f} hours") + logging.info(f"Filtered duration: {filtered_hours:.2f} hours") samples = [] sample_weights = [] @@ -219,15 +210,10 @@ def __getitem__(self, index): example["speaker"] = data.speaker example["speaker_index"] = data.speaker_index - if self.align_prior_config: + if self.include_align_prior: text_len = len(tokens) - spec_len = 1 + librosa.core.samples_to_frames( - audio.shape[0], hop_length=self.align_prior_config.hop_length - ) - if self.beta_binomial_interpolator: - align_prior = 
self.beta_binomial_interpolator(w=spec_len, h=text_len) - else: - align_prior = beta_binomial_prior_distribution(phoneme_count=text_len, mel_count=spec_len) + spec_len = 1 + librosa.core.samples_to_frames(audio.shape[0], hop_length=self.align_prior_hop_length) + align_prior = beta_binomial_prior_distribution(phoneme_count=text_len, mel_count=spec_len) align_prior = torch.tensor(align_prior, dtype=torch.float32) example["align_prior"] = align_prior @@ -265,7 +251,7 @@ def collate_fn(self, batch: List[dict]): if self.include_speaker: speaker_list.append(example["speaker_index"]) - if self.align_prior_config: + if self.include_align_prior: prior_list.append(example["align_prior"]) batch_audio_len = torch.IntTensor(audio_len_list) @@ -288,7 +274,7 @@ def collate_fn(self, batch: List[dict]): if self.include_speaker: batch_dict["speaker_id"] = torch.IntTensor(speaker_list) - if self.align_prior_config: + if self.include_align_prior: spec_max_len = max([prior.shape[0] for prior in prior_list]) text_max_len = max([prior.shape[1] for prior in prior_list]) batch_dict["align_prior_matrix"] = stack_tensors(prior_list, max_lens=[text_max_len, spec_max_len],) diff --git a/nemo/collections/tts/data/vocoder_dataset.py b/nemo/collections/tts/data/vocoder_dataset.py new file mode 100644 index 000000000000..9bb115ba2448 --- /dev/null +++ b/nemo/collections/tts/data/vocoder_dataset.py @@ -0,0 +1,202 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import librosa +import torch.utils.data + +from nemo.collections.asr.parts.preprocessing.segment import AudioSegment +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest +from nemo.collections.tts.parts.preprocessing.feature_processors import FeatureProcessor +from nemo.collections.tts.parts.utils.tts_dataset_utils import ( + filter_dataset_by_duration, + get_abs_rel_paths, + get_weighted_sampler, + stack_tensors, +) +from nemo.core.classes import Dataset +from nemo.utils import logging +from nemo.utils.decorators import experimental + + +@dataclass +class DatasetMeta: + manifest_path: Path + audio_dir: Path + sample_weight: float = 1.0 + + +@dataclass +class DatasetSample: + manifest_entry: dict + audio_dir: Path + + +@experimental +class VocoderDataset(Dataset): + """ + Class for processing and loading Vocoder training examples. + + Args: + dataset_meta: Dict of dataset names (string) to dataset metadata. + sample_rate: Sample rate to load audio as. If the audio is stored at a different sample rate, then it will + be resampled. + n_samples: Optional int, if provided then n_samples samples will be randomly sampled from the full + audio file. + weighted_sampling_steps_per_epoch: Optional int, If provided, then data will be sampled (with replacement) based on + the sample weights provided in the dataset metadata. If None, then sample weights will be ignored. + feature_processors: Optional, list of feature processors to run on training examples. + min_duration: Optional float, if provided audio files in the training manifest shorter than 'min_duration' + will be ignored. + max_duration: Optional float, if provided audio files in the training manifest longer than 'max_duration' + will be ignored. 
+ num_audio_retries: Number of read attempts to make when sampling audio file, to avoid training failing + from sporadic IO errors. + """ + + def __init__( + self, + dataset_meta: Dict, + sample_rate: int, + n_samples: Optional[int] = None, + weighted_sampling_steps_per_epoch: Optional[int] = None, + feature_processors: Optional[Dict[str, FeatureProcessor]] = None, + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, + num_audio_retries: int = 5, + ): + super().__init__() + + self.sample_rate = sample_rate + self.n_samples = n_samples + self.weighted_sampling_steps_per_epoch = weighted_sampling_steps_per_epoch + self.num_audio_retries = num_audio_retries + self.load_precomputed_mel = False + + if feature_processors: + logging.info(f"Found feature processors {feature_processors.keys()}") + self.feature_processors = list(feature_processors.values()) + else: + self.feature_processors = [] + + self.data_samples = [] + self.sample_weights = [] + for dataset_name, dataset_info in dataset_meta.items(): + dataset = DatasetMeta(**dataset_info) + samples, weights = self._preprocess_manifest( + dataset_name=dataset_name, dataset=dataset, min_duration=min_duration, max_duration=max_duration, + ) + self.data_samples += samples + self.sample_weights += weights + + def get_sampler(self, batch_size: int) -> Optional[torch.utils.data.Sampler]: + if not self.weighted_sampling_steps_per_epoch: + return None + + sampler = get_weighted_sampler( + sample_weights=self.sample_weights, batch_size=batch_size, num_steps=self.weighted_sampling_steps_per_epoch + ) + return sampler + + def _segment_audio(self, audio_filepath: Path) -> AudioSegment: + # Retry file read multiple times as file seeking can produce random IO errors. 
+ for _ in range(self.num_audio_retries): + try: + audio_segment = AudioSegment.segment_from_file( + audio_filepath, target_sr=self.sample_rate, n_segments=self.n_samples, + ) + return audio_segment + except Exception: + traceback.print_exc() + + raise ValueError(f"Failed to read audio {audio_filepath}") + + def _sample_audio(self, audio_filepath: Path) -> Tuple[torch.Tensor, torch.Tensor]: + if not self.n_samples: + audio_array, _ = librosa.load(audio_filepath, sr=self.sample_rate) + else: + audio_segment = self._segment_audio(audio_filepath) + audio_array = audio_segment.samples + audio = torch.tensor(audio_array) + audio_len = torch.tensor(audio.shape[0]) + return audio, audio_len + + @staticmethod + def _preprocess_manifest( + dataset_name: str, dataset: DatasetMeta, min_duration: float, max_duration: float, + ): + entries = read_manifest(dataset.manifest_path) + filtered_entries, total_hours, filtered_hours = filter_dataset_by_duration( + entries=entries, min_duration=min_duration, max_duration=max_duration + ) + + logging.info(dataset_name) + logging.info(f"Original # of files: {len(entries)}") + logging.info(f"Filtered # of files: {len(filtered_entries)}") + logging.info(f"Original duration: {total_hours:.2f} hours") + logging.info(f"Filtered duration: {filtered_hours:.2f} hours") + + samples = [] + sample_weights = [] + for entry in filtered_entries: + sample = DatasetSample(manifest_entry=entry, audio_dir=Path(dataset.audio_dir),) + samples.append(sample) + sample_weights.append(dataset.sample_weight) + + return samples, sample_weights + + def __len__(self): + return len(self.data_samples) + + def __getitem__(self, index): + data = self.data_samples[index] + + audio_filepath = Path(data.manifest_entry["audio_filepath"]) + audio_filepath_abs, audio_filepath_rel = get_abs_rel_paths(input_path=audio_filepath, base_path=data.audio_dir) + + audio, audio_len = self._sample_audio(audio_filepath_abs) + + example = {"audio_filepath": audio_filepath_rel, "audio": 
audio, "audio_len": audio_len} + + for processor in self.feature_processors: + processor.process(example) + + return example + + def collate_fn(self, batch: List[dict]): + audio_filepath_list = [] + audio_list = [] + audio_len_list = [] + + for example in batch: + audio_filepath_list.append(example["audio_filepath"]) + audio_list.append(example["audio"]) + audio_len_list.append(example["audio_len"]) + + batch_audio_len = torch.IntTensor(audio_len_list) + audio_max_len = int(batch_audio_len.max().item()) + + batch_audio = stack_tensors(audio_list, max_lens=[audio_max_len]) + + batch_dict = { + "audio_filepaths": audio_filepath_list, + "audio": batch_audio, + "audio_lens": batch_audio_len, + } + + return batch_dict diff --git a/nemo/collections/tts/models/hifigan.py b/nemo/collections/tts/models/hifigan.py index b7ab37e6589e..bf2eef33cdcf 100644 --- a/nemo/collections/tts/models/hifigan.py +++ b/nemo/collections/tts/models/hifigan.py @@ -13,6 +13,7 @@ # limitations under the License. import itertools +from pathlib import Path import torch import torch.nn.functional as F @@ -23,12 +24,13 @@ from nemo.collections.tts.losses.hifigan_losses import DiscriminatorLoss, FeatureMatchingLoss, GeneratorLoss from nemo.collections.tts.models.base import Vocoder from nemo.collections.tts.modules.hifigan_modules import MultiPeriodDiscriminator, MultiScaleDiscriminator +from nemo.collections.tts.parts.utils.callbacks import LoggingCallback from nemo.collections.tts.parts.utils.helpers import get_batch_size, get_num_workers, plot_spectrogram_to_numpy from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types.elements import AudioSignal, MelSpectrogramType from nemo.core.neural_types.neural_type import NeuralType -from nemo.core.optim.lr_scheduler import CosineAnnealing, compute_max_steps +from nemo.core.optim.lr_scheduler import compute_max_steps, prepare_lr_scheduler from nemo.utils import logging, 
model_utils HAVE_WANDB = True @@ -47,6 +49,7 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): # Convert to Hydra 1.0 compatible DictConfig cfg = model_utils.convert_model_config_to_dict_config(cfg) cfg = model_utils.maybe_update_config_version(cfg) + self.ds_class = cfg.train_ds.dataset._target_ super().__init__(cfg=cfg, trainer=trainer) @@ -69,9 +72,22 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): if self._train_dl: self.input_as_mel = self._train_dl.dataset.load_precomputed_mel + self.log_audio = cfg.get("log_audio", False) + self.log_config = cfg.get("log_config", None) + self.lr_schedule_interval = None self.automatic_optimization = False - def _get_max_steps(self): + @property + def max_steps(self): + if "max_steps" in self._cfg: + return self._cfg.get("max_steps") + + if "max_epochs" not in self._cfg: + raise ValueError("Must specify 'max_steps' or 'max_epochs'.") + + if "steps_per_epoch" in self._cfg: + return self._cfg.max_epochs * self._cfg.steps_per_epoch + return compute_max_steps( max_epochs=self._cfg.max_epochs, accumulate_grad_batches=self.trainer.accumulate_grad_batches, @@ -84,16 +100,13 @@ def _get_max_steps(self): @staticmethod def get_warmup_steps(max_steps, warmup_steps, warmup_ratio): - if warmup_steps is not None and warmup_ratio is not None: - raise ValueError(f'Either use warmup_steps or warmup_ratio for scheduler') - if warmup_steps is not None: return warmup_steps if warmup_ratio is not None: return warmup_ratio * max_steps - raise ValueError(f'Specify warmup_steps or warmup_ratio for scheduler') + return None def configure_optimizers(self): optim_config = self._cfg.optim.copy() @@ -102,42 +115,47 @@ def configure_optimizers(self): sched_config = optim_config.pop("sched", None) OmegaConf.set_struct(optim_config, True) - optim_g = instantiate(optim_config, params=self.generator.parameters(),) - optim_d = instantiate(optim_config, params=itertools.chain(self.msd.parameters(), self.mpd.parameters()),) - 
- # Backward compatibility - if sched_config is None and 'sched' in self._cfg: - sched_config = self._cfg.sched - - if sched_config is not None: - max_steps = self._cfg.get("max_steps", None) - if max_steps is None or max_steps < 0: - max_steps = self._get_max_steps() - - warmup_steps = HifiGanModel.get_warmup_steps( - max_steps=max_steps, - warmup_steps=sched_config.get("warmup_steps", None), - warmup_ratio=sched_config.get("warmup_ratio", None), - ) - - scheduler_g = CosineAnnealing( - optimizer=optim_g, max_steps=max_steps, min_lr=sched_config.min_lr, warmup_steps=warmup_steps, - ) # Use warmup to delay start - sch1_dict = { - 'scheduler': scheduler_g, - 'interval': 'step', - } - - scheduler_d = CosineAnnealing(optimizer=optim_d, max_steps=max_steps, min_lr=sched_config.min_lr,) - sch2_dict = { - 'scheduler': scheduler_d, - 'interval': 'step', - } - - return [optim_g, optim_d], [sch1_dict, sch2_dict] - else: + gen_params = self.generator.parameters() + disc_params = itertools.chain(self.msd.parameters(), self.mpd.parameters()) + optim_g = instantiate(optim_config, params=gen_params) + optim_d = instantiate(optim_config, params=disc_params) + + if sched_config is None: return [optim_g, optim_d] + max_steps = self.max_steps + warmup_steps = self.get_warmup_steps( + max_steps=max_steps, + warmup_steps=sched_config.get("warmup_steps", None), + warmup_ratio=sched_config.get("warmup_ratio", None), + ) + + OmegaConf.set_struct(sched_config, False) + sched_config["max_steps"] = max_steps + if warmup_steps: + sched_config["warmup_steps"] = warmup_steps + sched_config.pop("warmup_ratio", None) + OmegaConf.set_struct(sched_config, True) + + scheduler_g = prepare_lr_scheduler( + optimizer=optim_g, scheduler_config=sched_config, train_dataloader=self._train_dl + ) + + scheduler_d = prepare_lr_scheduler( + optimizer=optim_d, scheduler_config=sched_config, train_dataloader=self._train_dl + ) + + self.lr_schedule_interval = scheduler_g["interval"] + + return [optim_g, optim_d], 
[scheduler_g, scheduler_d] + + def update_lr(self, interval="step"): + schedulers = self.lr_schedulers() + if schedulers is not None and self.lr_schedule_interval == interval: + sch1, sch2 = schedulers + sch1.step() + sch2.step() + @typecheck() def forward(self, *, spec): """ @@ -153,12 +171,7 @@ def convert_spectrogram_to_audio(self, spec: 'torch.tensor') -> 'torch.tensor': return self(spec=spec).squeeze(1) def training_step(self, batch, batch_idx): - if self.input_as_mel: - # Pre-computed spectrograms will be used as input - audio, audio_len, audio_mel = batch - else: - audio, audio_len = batch - audio_mel, _ = self.audio_to_melspec_precessor(audio, audio_len) + audio, audio_len, audio_mel, _ = self._process_batch(batch) # Mel as input for L1 mel loss audio_trg_mel, _ = self.trg_melspec_fn(audio, audio_len) @@ -196,12 +209,7 @@ def training_step(self, batch, batch_idx): self.manual_backward(loss_g) optim_g.step() - # Run schedulers - schedulers = self.lr_schedulers() - if schedulers is not None: - sch1, sch2 = schedulers - sch1.step() - sch2.step() + self.update_lr() metrics = { "g_loss_fm_mpd": loss_fm_mpd, @@ -218,18 +226,13 @@ def training_step(self, batch, batch_idx): self.log_dict(metrics, on_step=True, sync_dist=True) self.log("g_l1_loss", loss_mel, prog_bar=True, logger=False, sync_dist=True) + def training_epoch_end(self, outputs) -> None: + self.update_lr("epoch") + def validation_step(self, batch, batch_idx): - if self.input_as_mel: - audio, audio_len, audio_mel = batch - audio_mel_len = [audio_mel.shape[1]] * audio_mel.shape[0] - else: - audio, audio_len = batch - audio_mel, audio_mel_len = self.audio_to_melspec_precessor(audio, audio_len) - audio_pred = self(spec=audio_mel) + audio, audio_len, audio_mel, audio_mel_len = self._process_batch(batch) - # Perform bias denoising - pred_denoised = self._bias_denoise(audio_pred, audio_mel).squeeze(1) - pred_denoised_mel, _ = self.audio_to_melspec_precessor(pred_denoised, audio_len) + audio_pred = 
self(spec=audio_mel) if self.input_as_mel: gt_mel, gt_mel_len = self.audio_to_melspec_precessor(audio, audio_len) @@ -239,7 +242,11 @@ def validation_step(self, batch, batch_idx): self.log_dict({"val_loss": loss_mel}, on_epoch=True, sync_dist=True) # Plot audio once per epoch - if batch_idx == 0 and isinstance(self.logger, WandbLogger) and HAVE_WANDB: + if self.log_audio and batch_idx == 0 and isinstance(self.logger, WandbLogger) and HAVE_WANDB: + # Perform bias denoising + pred_denoised = self._bias_denoise(audio_pred, audio_mel).squeeze(1) + pred_denoised_mel, _ = self.audio_to_melspec_precessor(pred_denoised, audio_len) + clips = [] specs = [] for i in range(min(5, audio.shape[0])): @@ -284,6 +291,21 @@ def validation_step(self, batch, batch_idx): self.logger.experiment.log({"audio": clips, "specs": specs}) + def _process_batch(self, batch): + if self.input_as_mel: + audio, audio_len, audio_mel = batch + audio_mel_len = [audio_mel.shape[1]] * audio_mel.shape[0] + return audio, audio_len, audio_mel, audio_mel_len + + if self.ds_class == "nemo.collections.tts.data.vocoder_dataset.VocoderDataset": + audio = batch.get("audio") + audio_len = batch.get("audio_lens") + else: + audio, audio_len = batch + + audio_mel, audio_mel_len = self.audio_to_melspec_precessor(audio, audio_len) + return audio, audio_len, audio_mel, audio_mel_len + def _bias_denoise(self, audio, mel): def stft(x): comp = torch.stft(x.squeeze(1), n_fft=1024, hop_length=256, win_length=1024, return_complex=True) @@ -311,6 +333,19 @@ def istft(mags, phase): return audio_denoised + def _setup_train_dataloader(self, cfg): + dataset = instantiate(cfg.dataset) + sampler = dataset.get_sampler(cfg.dataloader_params.batch_size) + data_loader = torch.utils.data.DataLoader( + dataset, collate_fn=dataset.collate_fn, sampler=sampler, **cfg.dataloader_params + ) + return data_loader + + def _setup_test_dataloader(self, cfg): + dataset = instantiate(cfg.dataset) + data_loader = torch.utils.data.DataLoader(dataset, 
collate_fn=dataset.collate_fn, **cfg.dataloader_params) + return data_loader + def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"): if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig): raise ValueError(f"No dataset for {name}") @@ -333,14 +368,44 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) def setup_training_data(self, cfg): - self._train_dl = self.__setup_dataloader_from_config(cfg) + if self.ds_class == "nemo.collections.tts.data.vocoder_dataset.VocoderDataset": + self._train_dl = self._setup_train_dataloader(cfg) + else: + self._train_dl = self.__setup_dataloader_from_config(cfg) def setup_validation_data(self, cfg): - self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation") + if self.ds_class == "nemo.collections.tts.data.vocoder_dataset.VocoderDataset": + self._validation_dl = self._setup_test_dataloader(cfg) + else: + self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="validation") def setup_test_data(self, cfg): pass + def configure_callbacks(self): + if not self.log_config: + return [] + + sample_ds_class = self.log_config.dataset._target_ + if sample_ds_class != "nemo.collections.tts.data.vocoder_dataset.VocoderDataset": + raise ValueError(f"Sample logging only supported for VocoderDataset, got {sample_ds_class}") + + data_loader = self._setup_test_dataloader(self.log_config) + generators = instantiate(self.log_config.generators) + log_dir = Path(self.log_config.log_dir) if self.log_config.log_dir else None + log_callback = LoggingCallback( + generators=generators, + data_loader=data_loader, + log_epochs=self.log_config.log_epochs, + epoch_frequency=self.log_config.epoch_frequency, + output_dir=log_dir, + loggers=self.trainer.loggers, + 
log_tensorboard=self.log_config.log_tensorboard, + log_wandb=self.log_config.log_wandb, + ) + + return [log_callback] + @classmethod def list_available_models(cls) -> 'Optional[Dict[str, str]]': list_of_models = [] diff --git a/nemo/collections/tts/parts/utils/callbacks.py b/nemo/collections/tts/parts/utils/callbacks.py index 0f8bd0fa4177..2320e5b21a7c 100644 --- a/nemo/collections/tts/parts/utils/callbacks.py +++ b/nemo/collections/tts/parts/utils/callbacks.py @@ -22,6 +22,7 @@ import numpy as np import soundfile as sf import torch +from einops import rearrange from pytorch_lightning import Callback, LightningModule, Trainer from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.loggers.logger import Logger @@ -65,11 +66,13 @@ def _load_vocoder(model_name: Optional[str], checkpoint_path: Optional[str], typ raise ValueError(f"Unknown vocoder type '{type}'") if model_name is not None: - vocoder = model_type.from_pretrained(model_name).eval() + vocoder = model_type.from_pretrained(model_name) + elif checkpoint_path.endswith(".nemo"): + vocoder = model_type.restore_from(checkpoint_path) else: - vocoder = model_type.load_from_checkpoint(checkpoint_path).eval() + vocoder = model_type.load_from_checkpoint(checkpoint_path) - return vocoder + return vocoder.eval() @dataclass @@ -229,6 +232,39 @@ def on_train_epoch_end(self, trainer: Trainer, model: LightningModule): self._log_image(image=image, log_dir=log_dir, step=model.global_step) +class VocoderArtifactGenerator(ArtifactGenerator): + """ + Generator for logging Vocoder model outputs. 
+ """ + + def generate_artifacts( + self, model: LightningModule, batch_dict: Dict + ) -> Tuple[List[AudioArtifact], List[ImageArtifact]]: + + audio_artifacts = [] + + audio_filepaths = batch_dict.get("audio_filepaths") + audio_ids = [create_id(p) for p in audio_filepaths] + + audio = batch_dict.get("audio") + audio_len = batch_dict.get("audio_lens") + + spec, spec_len = model.audio_to_melspec_precessor(audio, audio_len) + + with torch.no_grad(): + audio_pred = model.forward(spec=spec) + audio_pred = rearrange(audio_pred, "B 1 T -> B T") + + for i, audio_id in enumerate(audio_ids): + audio_pred_i = audio_pred[i][: audio_len[i]].cpu().numpy() + audio_artifact = AudioArtifact( + id=f"audio_{audio_id}", data=audio_pred_i, filename=f"{audio_id}.wav", sample_rate=model.sample_rate, + ) + audio_artifacts.append(audio_artifact) + + return audio_artifacts, [] + + class FastPitchArtifactGenerator(ArtifactGenerator): """ Generator for logging FastPitch model outputs. @@ -339,10 +375,9 @@ def _generate_gta_predictions(self, model: LightningModule, audio_ids: List[str] ) if self.log_alignment: - # [B, T_spec, T_text] - attn = attn.squeeze(1) + attn = rearrange(attn, "B 1 T_spec T_text -> B T_text T_spec") for i, audio_id in enumerate(audio_ids): - attn_i = attn[i][: mels_pred_len[i], : text_lens[i]].cpu().numpy() + attn_i = attn[i][: text_lens[i], : mels_pred_len[i]].cpu().numpy() alignment_artifact = ImageArtifact( id=f"align_{audio_id}", data=attn_i, diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index c454e6290477..73ad1e18a94b 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -975,5 +975,6 @@ def compute_max_steps( } EPOCH_SCHEDULERS = { + 'ExponentialLR': pt_scheduler.ExponentialLR, 'ReduceLROnPlateau': pt_scheduler.ReduceLROnPlateau, } diff --git a/scripts/dataset_processing/tts/preprocess_audio.py b/scripts/dataset_processing/tts/preprocess_audio.py index 9d2783ebff18..1912d45d4bed 100644 --- 
a/scripts/dataset_processing/tts/preprocess_audio.py +++ b/scripts/dataset_processing/tts/preprocess_audio.py @@ -24,7 +24,7 @@ Most of these can also be done by the TTS data loader at training time, but doing them ahead of time lets us implement more complex processing, validate the correctness of the output, and save on compute time. -$ python /scripts/dataset_processing/tts/audio_processing/preprocess_audio.py \ +$ python /scripts/dataset_processing/tts/preprocess_audio.py \ --input_manifest="/manifest.json" \ --output_manifest="/manifest_processed.json" \ --input_audio_dir="/audio" \ diff --git a/scripts/dataset_processing/tts/preprocess_text.py b/scripts/dataset_processing/tts/preprocess_text.py index 30893156ebfb..8b9bdebe940d 100644 --- a/scripts/dataset_processing/tts/preprocess_text.py +++ b/scripts/dataset_processing/tts/preprocess_text.py @@ -49,7 +49,9 @@ def get_args(): "--output_manifest", required=True, type=Path, help="Path to output training manifest with processed text.", ) parser.add_argument( - "--overwrite", default=False, type=bool, help="Whether to overwrite the output manifest file if it exists.", + "--overwrite", + action=argparse.BooleanOptionalAction, + help="Whether to overwrite the output manifest file if it exists.", ) parser.add_argument( "--lower_case", default=False, type=bool, help="Whether to convert the final text to lower case.", From 9827c9b11ffc6ef10d5eb22c6b5bbc1927d7e2ec Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Fri, 2 Jun 2023 14:37:05 -0700 Subject: [PATCH 006/123] GPT inference long context (#6687) * deb infer Signed-off-by: Evelina * deb infer Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clean up Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * dont do maxlen trunc for non abs pos emb Signed-off-by: Evelina * dont do 
maxlen trunc for non abs pos emb Signed-off-by: Evelina * convert for training only Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add eval test, add save .nemo for sft model Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * jenkins format fix Signed-off-by: Evelina * update jenkins Signed-off-by: Evelina * update jenkins Signed-off-by: Evelina * fix jenkins Signed-off-by: Evelina * remove test, ci timeout Signed-off-by: Evelina * fix for m_gpt_eval.py Signed-off-by: Evelina * jenkins test Signed-off-by: Evelina * fix gpt_eval with sft model Signed-off-by: Evelina * revert jenkins Signed-off-by: Evelina * keep float conversion for model.generate() Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix inference dtype Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 25 ++++++++++++++++++- .../language_modeling/megatron_gpt_eval.py | 2 +- .../conf/megatron_gpt_peft_eval_config.yaml | 2 +- .../tuning/conf/megatron_gpt_sft.yaml | 2 +- .../tuning/megatron_gpt_sft.py | 7 ++++++ .../megatron/gpt_sft_dataset.py | 4 ++- .../megatron_gpt_sft_model.py | 8 +++++- .../nlp/modules/common/megatron/module.py | 2 +- .../common/text_generation_strategy.py | 7 ++++-- .../modules/common/text_generation_utils.py | 18 ++++++++----- 10 files changed, 62 insertions(+), 15 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index aacdd9575764..79c696a48600 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3407,7 +3407,30 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.num_nodes=1" } } - + stage('L2: Megatron GPT 
SFT Eval (inference seq len > training seq len)') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps{ + sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ + model.peft.restore_from_path=null \ + model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \ + model.data.test_ds.names=['test'] \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=30 \ + model.data.test_ds.max_seq_length=6000 \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \ + rm -rf examples/nlp/language_modeling/out.jsonl" + } + } stage('L2: Megatron GPT Prompt Tuning TP1 PP1') { when { anyOf { diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 14cdbf8a760c..d7319fb72a01 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -263,7 +263,7 @@ def main(cfg) -> None: print(response) print("***************************") - # Second method of running text generation, call trainer.predict + # Second method of running text generation, call trainer.predict [recommended] ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) request_dl = DataLoader(dataset=ds, batch_size=2) config = OmegaConf.to_container(cfg.inference) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml index c430bd7fab5f..69dc17f244f5 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml @@ -129,4 +129,4 @@ inference: 
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: /home/adithyare/exp/foo.txt \ No newline at end of file + outfile_path: output.txt \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index 678851db3b01..f8a8e6b9dbc0 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -29,7 +29,7 @@ exp_manager: monitor: validation_${model.data.validation_ds.metric.name} save_top_k: 2 mode: max - save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below, + save_nemo_on_train_end: False filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' model_parallel_size: ${model.tensor_model_parallel_size} save_best_model: True diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py index b2b8786df8c1..0737d55cc514 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py @@ -61,6 +61,8 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): gpt_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.0) gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0) gpt_cfg.ffn_dropout = cfg.model.ffn_dropout + sft_cls = MegatronGPTSFTModel + gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. 
@@ -167,6 +169,10 @@ def main(cfg) -> None: trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + if cfg.model.restore_from_path: save_restore_connector = NLPSaveRestoreConnector() if os.path.isdir(cfg.model.restore_from_path): @@ -177,6 +183,7 @@ def main(cfg) -> None: return_config=True, save_restore_connector=save_restore_connector, ) + gpt_cfg = _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False) model = load_from_nemo(MegatronGPTSFTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 24b7fe8d3d6d..f9ef6c8470c2 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -169,7 +169,9 @@ def _process_example(self, example): tokenized_text = pre_pad + self.tokenizer.text_to_ids(text) context_ids = pre_pad + self.tokenizer.text_to_ids(context) answer_ids = tokenized_text[len(context_ids) :] - total_ids = len(context_ids) + len(answer_ids) + + # for the long context cases, collate_fn includes self.tokens_to_generate for padding + total_ids = len(context_ids) + max(len(answer_ids), self.tokens_to_generate) if self.add_bos: total_ids += 1 if self.add_sep: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 7819d28e8150..1dc335b86609 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -550,7 +550,13 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] return compute_prob_response else: del inference_config['compute_logprob'] - inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda()) + + # for megatron_gpt_eval.py + if isinstance(batch, list): + inference_config['inputs'] = batch + else: + # peft_eval.py + inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda()) return generate(self, **inference_config) def write_predictions_to_file(self, outputs, output_file_path_prefix): diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 0a340985eec2..22a223013fd2 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -290,7 +290,7 @@ def forward(self, *inputs, **kwargs): if getattr(self.module, 'pre_process', True): inputs = fp32_to_float16(inputs, self.float16_converter) outputs = self.module(*inputs, **kwargs) - if parallel_state.is_pipeline_last_stage(): + if parallel_state.is_pipeline_last_stage() and self.training: outputs = float16_to_fp32(outputs) return outputs diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 16935be1cc2d..27ae3b2606d3 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -181,8 +181,11 @@ def __init__(self, model): def clip_max_len(self, maxlen: int) -> int: """ clip the max len based on the LM model max sequence length""" - if maxlen > self.model.cfg.encoder_seq_length + 1: - maxlen = self.model.cfg.encoder_seq_length + 1 + + # for positional embedding types that allow length extrapolation, don't clip the max length + 
if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": + if maxlen > self.model.cfg.encoder_seq_length + 1: + maxlen = self.model.cfg.encoder_seq_length + 1 return maxlen def init_batch(self, context_tokens: torch.Tensor, context_length: int): diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 8cfb02c5e321..3a07a807b11a 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -421,9 +421,15 @@ def synced_generate( if parallel_state.is_pipeline_first_stage(): src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_embedding_group() - output_logits = torch.empty( - tokens.size(0), context_length - 1, dtype=torch.float32, device=torch.device("cuda") - ) + + precision = model._trainer.precision + if precision in [16, "16"]: + dtype = torch.float16 + elif precision == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + output_logits = torch.empty(tokens.size(0), context_length - 1, dtype=dtype, device=torch.device("cuda")) torch.distributed.broadcast(output_logits, src, group) if all_probs: @@ -433,7 +439,7 @@ def synced_generate( tokens.size(0), context_length - 1, model.padded_vocab_size, - dtype=torch.float32, + dtype=dtype, device=torch.device("cuda"), ) torch.distributed.broadcast(full_logits, src, group) @@ -667,10 +673,10 @@ def sample_sequence_batch( output = inference_strategy.forward_step(batch, tensor_shape) if parallel_state.is_pipeline_last_stage(): - output = output[0]['logits'].float() + output = output[0]['logits'] + output = tensor_parallel.gather_from_tensor_model_parallel_region(output) assert output is not None - output = output.float() logits = output[:, -1].view(batch_size, -1).contiguous() # make sure it will generate at least min_length From d5819e9cc0d3733d7aeeef9bde64c12200b6c415 Mon 
Sep 17 00:00:00 2001 From: Hainan Xu Date: Fri, 2 Jun 2023 18:08:29 -0400 Subject: [PATCH 007/123] TDT model pull request (#6536) * TDT model pull request, initial draft Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TDT PR WIP Signed-off-by: Hainan Xu * TDT PR WIP Signed-off-by: Hainan Xu * TDT PR WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * TDT WIP Signed-off-by: Hainan Xu * addressed some review comments, part1 Signed-off-by: Hainan Xu * addressed some review comments, part1, one line fix Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add tests for comparing TDT alphas with pytorch VS kernel computation Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add tests for comparing multiblank alphas with pytorch VS kernel computation Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add tests for fixed case computation for TDT Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add more comments for greedy-batch decoding for TDT Signed-off-by: Hainan Xu * include config for TDT model with stateless decoders Signed-off-by: Hainan Xu * add 
reference to TDT in Readme Signed-off-by: Hainan Xu * slight modification of config file comments Signed-off-by: Hainan Xu * addressed more comments Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more detailed comments for tdt kernel Signed-off-by: Hainan Xu * one line fix Signed-off-by: Hainan Xu * fixed small bug that results in test fails for rnnt_decoding Signed-off-by: Hainan Xu * fixed small bug that results in test fails for rnnt_decoding Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed small bug that results in test fails for rnnt_decoding Signed-off-by: Hainan Xu * remove unused import Signed-off-by: Hainan Xu --------- Signed-off-by: Hainan Xu Co-authored-by: Hainan Xu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.rst | 2 +- .../conformer_multiblank_transducer_bpe.yaml | 1 + .../conf/conformer/tdt/conformer_tdt_bpe.yaml | 281 +++++++++ .../tdt/conformer_tdt_bpe_stateless.yaml | 278 +++++++++ nemo/collections/asr/losses/rnnt.py | 50 +- nemo/collections/asr/losses/rnnt_pytorch.py | 138 ++++- nemo/collections/asr/metrics/rnnt_wer.py | 113 +++- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 9 +- nemo/collections/asr/models/rnnt_models.py | 7 +- .../asr/parts/numba/rnnt_loss/__init__.py | 6 +- .../asr/parts/numba/rnnt_loss/rnnt.py | 127 ++++ .../asr/parts/numba/rnnt_loss/rnnt_pytorch.py | 226 ++++++- .../rnnt_loss/utils/cuda_utils/gpu_rnnt.py | 315 +++++++++- .../utils/cuda_utils/gpu_rnnt_kernel.py | 531 +++++++++++++++++ .../parts/submodules/rnnt_greedy_decoding.py | 561 ++++++++++++++++++ .../asr/decoding/test_rnnt_decoding.py | 2 +- .../asr/numba/rnnt_loss/test_rnnt_pytorch.py | 71 ++- .../rnnt_loss/utils/test_gpu_rnnt_kernel.py | 187 ++++++ .../asr/test_asr_rnnt_encdec_model.py | 44 ++ 19 files changed, 2887 insertions(+), 62 
deletions(-) create mode 100644 examples/asr/conf/conformer/tdt/conformer_tdt_bpe.yaml create mode 100644 examples/asr/conf/conformer/tdt/conformer_tdt_bpe_stateless.yaml diff --git a/README.rst b/README.rst index 2e6f5580a3e5..b9ba7fce30f3 100644 --- a/README.rst +++ b/README.rst @@ -84,7 +84,7 @@ Key Features * CTC * Transducer/RNNT * Hybrid Transducer/CTC - * NeMo Original `Multi-blank Transducers `_ + * NeMo Original `Multi-blank Transducers `_ and `Token-and-Duration Transducers (TDT) `_ * Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ * Cache-aware Streaming Conformer - ``_ * Beam Search decoding diff --git a/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml b/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml index 84d767e4a3b5..51e57e72e2ad 100644 --- a/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml @@ -179,6 +179,7 @@ model: decoding: strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + model_type: "multiblank" # this must not be None in order to use the multi-blank specific decoding method. # you could set this to [1, 1, 1] so that big blanks are treated the same diff --git a/examples/asr/conf/conformer/tdt/conformer_tdt_bpe.yaml b/examples/asr/conf/conformer/tdt/conformer_tdt_bpe.yaml new file mode 100644 index 000000000000..0210bd5a2dad --- /dev/null +++ b/examples/asr/conf/conformer/tdt/conformer_tdt_bpe.yaml @@ -0,0 +1,281 @@ +# This file contains the default values for training a Conformer-TDT ASR model, large size (~120M) with sub-word encoding. + +# You can find detailed info about TDT models at https://arxiv.org/abs/2304.06795. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. 
To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file. + +# Note: the added duration outputs from the joiner make TDT models slightly larger than corresponding conventional RNN-T models, +# although the difference is tiny -- the added number of params is roughly num-durations X (joint_hidden + pred_hidden), typically in the +# order of thousands of params. This is negligible even with the "Small" config with around 14 million params. +# Recommended duration config is [0, 1, 2, ... , n] where optimal n is usually between 4 and 8 depending on the dataset. + +# +--------------+---------+---------+----------+------------------+--------------+--------------------------+-----------------+ +# | Model | d_model | n_heads | n_layers | conv_kernel_size | weight_decay | pred_hidden/joint_hidden | pred_rnn_layers | +# +==============+=========+========+===========+==================+==============+==========================+=================+ +# | Small (14M)| 176 | 4 | 16 | 31 | 0.0 | 320 | 1 | +# +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ +# | Medium (32M)| 256 | 4 | 16 | 31 | 1e-3 | 640 | 1 | +# +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ +# | Large (120M)| 512 | 8 | 17 | 31 | 1e-3 | 640 | 1 | +# +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ +# | XLarge (644M)| 1024 | 8 | 24 | 5 | 1e-3 | 640 | 2 | +# +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ + +# Default learning parameters in this config are set for global batch size of 2K while 
you may use lower values. +# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. + +name: "Conformer-TDT-BPE" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + # variables for TDT configs. + tdt_durations: [0, 1, 2, 3, 4] + num_tdt_durations: 5 + + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 4 # must be power of 2 for striding and vggnet + subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model + causal_downsampling: false + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + att_context_style: regular # regular or chunked_limited + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 31 + conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + # conv_context_size can 
be "causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + conv_context_size: null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. 
+ fuse_loss_wer: true + fused_batch_size: 16 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + num_extra_outputs: ${model.model_defaults.num_tdt_durations} + + decoding: + # Using greedy decoding is highly recommended for TDT models. Using greedy-batch will give very bad results + # if omega is 0; even if omega is non-zero, greedy-batch results are still going to be inaccurate. + strategy: "greedy" + + model_type: "tdt" + + # this must not be None in order to use the TDT specific decoding method. + durations: ${model.model_defaults.tdt_durations} + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + loss: + # This is the main difference between a TDT model and a conventional RNNT model -- the loss function. + loss_name: "tdt" + + tdt_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to reduce the latency of the model for streaming + fastemit_lambda: 0.001 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + # refer to https://arxiv.org/abs/2304.06795 for the meaning of the following three configs. + durations: ${model.model_defaults.tdt_durations} + sigma: 0.05 # hyper-param for under-normalization. + omega: 0.1 # weight for regular RNN-T loss.
+ + # Adds Gaussian noise to the gradients of the decoder to avoid overfitting + variational_noise: + start_step: 0 + std: 0.0 + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 500 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null + diff --git a/examples/asr/conf/conformer/tdt/conformer_tdt_bpe_stateless.yaml b/examples/asr/conf/conformer/tdt/conformer_tdt_bpe_stateless.yaml new file mode 100644 index 000000000000..fefbd6f8f56c --- /dev/null +++ b/examples/asr/conf/conformer/tdt/conformer_tdt_bpe_stateless.yaml @@ -0,0 +1,278 @@ +# This file contains the default values for training a TDT Conformer-Transducer ASR model, large size (~120M) with sub-word encoding. + +# You can find detailed info about TDT models at https://arxiv.org/abs/2304.06795. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file.
+ +# Note: the added duration outputs from the joiner make TDT models slightly larger than corresponding conventional RNN-T models, +# although the difference is tiny -- the added number of params is roughly num-durations X (joint_hidden + pred_hidden), typically in the +# order of thousands of params. This is negligible even with the "Small" config with around 14 million params. +# Recommended duration config is [0, 1, 2, ... , n] where optimal n is usually between 4 and 8 depending on the dataset. + +# +--------------+---------+---------+----------+------------------+--------------+--------------------------+-----------------+ +# | Model | d_model | n_heads | n_layers | conv_kernel_size | weight_decay | pred_hidden/joint_hidden | decoder_context | +# +==============+=========+========+===========+==================+==============+==========================+=================+ +# | Large (117M)| 512 | 8 | 17 | 31 | 1e-3 | 640 | 2 | +# +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ + +# Default learning parameters in this config are set for global batch size of 2K while you may use lower values. +# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. + + +name: "Conformer-TDT-BPE-Stateless" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + # variables for TDT configs.
+ tdt_durations: [0, 1, 2, 3, 4] + num_tdt_durations: 5 + + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 4 # must be power of 2 for striding and vggnet + subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model + causal_downsampling: false + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + att_context_style: regular # regular or chunked_limited + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 31 + conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + # conv_context_size can 
be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + conv_context_size: null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + decoder: + _target_: nemo.collections.asr.modules.StatelessTransducerDecoder + context_size: 2 # The Stateless decoder uses 2 words as context by default. + normalization_mode: layer # This helps stabilize training for Stateless decoders. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 16 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + # this variable is non-zero for this TDT model, as well as multi-blank models. 
It represents the number of + # additional outputs from the joiner, besides all tokens in the BPE vocab plus the (standard) blank symbol. + num_extra_outputs: ${model.model_defaults.num_tdt_durations} + + decoding: + # Using greedy decoding is highly recommended for TDT models. Using greedy-batch will give very bad results + # if omega is 0; even if omega is non-zero, greedy-batch results are still going to be inaccurate. + strategy: "greedy" + + model_type: "tdt" + + # this must not be None in order to use the TDT specific decoding method. + durations: ${model.model_defaults.tdt_durations} + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + loss: + # This is the main difference between a TDT model and a conventional RNNT model -- the loss function. + loss_name: "tdt" + + tdt_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to reduce the latency of the model for streaming + fastemit_lambda: 0.001 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + # refer to https://arxiv.org/abs/2304.06795 for the meaning of the following three configs. + durations: ${model.model_defaults.tdt_durations} + sigma: 0.05 # hyper-param for under-normalization. + omega: 0.1 # weight for regular RNN-T loss.
+ + # Adds Gaussian noise to the gradients of the decoder to avoid overfitting + variational_noise: + start_step: 0 + std: 0.0 + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 500 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null + diff --git a/nemo/collections/asr/losses/rnnt.py b/nemo/collections/asr/losses/rnnt.py index 63381f82ee4a..10b85acb42ef 100644 --- a/nemo/collections/asr/losses/rnnt.py +++ b/nemo/collections/asr/losses/rnnt.py @@ -35,7 +35,7 @@ import torch from omegaconf import DictConfig, OmegaConf -from nemo.collections.asr.losses.rnnt_pytorch import MultiblankRNNTLossPytorch, RNNTLossPytorch +from nemo.collections.asr.losses.rnnt_pytorch import MultiblankRNNTLossPytorch, RNNTLossPytorch, TDTLossPytorch from nemo.core.classes import Loss, typecheck from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType from nemo.core.utils.k2_utils import K2_INSTALLATION_MESSAGE @@ -50,7 +50,7 @@ WARP_RNNT_AVAILABLE = False try: - from nemo.collections.asr.parts.numba.rnnt_loss import MultiblankRNNTLossNumba, RNNTLossNumba + from nemo.collections.asr.parts.numba.rnnt_loss import MultiblankRNNTLossNumba, RNNTLossNumba, TDTLossNumba NUMBA_RNNT_AVAILABLE = True except (ImportError, 
ModuleNotFoundError): @@ -138,6 +138,20 @@ class RNNTLossConfig: installation_msg=K2_INSTALLATION_MESSAGE, force_float32=False, ), + "tdt": RNNTLossConfig( + loss_name="tdt", + lib_name="numba", + min_version='0.53.0', + is_available=NUMBA_RNNT_AVAILABLE, + installation_msg=NUMBA_INSTALLATION_MESSAGE, + ), + "tdt_pytorch": RNNTLossConfig( + loss_name="tdt_pytorch", + lib_name="torch", + min_version='0.0', + is_available=True, + installation_msg="Pure Pytorch implementation of TDT loss. Slow and for debugging purposes only.", + ), } RNNT_LOSS_RESOLVER['default'] = RNNT_LOSS_RESOLVER['warprnnt_numba'] @@ -274,6 +288,30 @@ def resolve_rnnt_loss(loss_name: str, blank_idx: int, loss_kwargs: dict = None) blank=blank_idx, big_blank_durations=big_blank_durations, reduction='none', sigma=sigma ) _warn_unused_additional_kwargs(loss_name, loss_kwargs) + + elif loss_name == 'tdt': + fastemit_lambda = loss_kwargs.pop('fastemit_lambda', 0.0) + clamp = loss_kwargs.pop('clamp', -1.0) + durations = loss_kwargs.pop('durations', None) + sigma = loss_kwargs.pop('sigma', 0.0) + omega = loss_kwargs.pop('omega', 0.0) + loss_func = TDTLossNumba( + blank=blank_idx, + durations=durations, + reduction='none', + fastemit_lambda=fastemit_lambda, + clamp=clamp, + sigma=sigma, + omega=omega, + ) + _warn_unused_additional_kwargs(loss_name, loss_kwargs) + + elif loss_name == 'tdt_pytorch': + durations = loss_kwargs.pop('durations', None) + sigma = loss_kwargs.pop('sigma', 0.0) + loss_func = TDTLossPytorch(blank=blank_idx, durations=durations, reduction='none', sigma=sigma) + _warn_unused_additional_kwargs(loss_name, loss_kwargs) + elif loss_name == "graph_rnnt": loss_kwargs = _clean_kwargs(loss_name, loss_kwargs, GraphRnntLoss.__init__, ignore_params={"blank"}) loss_func = GraphRnntLoss(blank=blank_idx, **loss_kwargs) @@ -345,7 +383,13 @@ def __init__(self, num_classes, reduction: str = 'mean_batch', loss_name: str = Args: num_classes: Number of target classes for the joint network to predict. 
- (Excluding the RNN-T blank token). + In all cases (conventional RNNT, multi-blank RNNT, and TDT model), this equals the token-id + for the standard "blank" symbol. In particular, say V is the number of non-blank tokens in + the vocabulary, then in the case of, + standard RNNT: num_classes = V + multiblank RNNT: num_classes = V + number-big-blanks (since we store big-blanks before + standard blank, and the standard blank is the last symbol in the vocab) + TDT: num_classes = V. Note, V here does not include any of the "duration outputs". reduction: Type of reduction to perform on loss. Possible values are `mean_batch`, 'mean_volume`, `mean`, `sum` or None. diff --git a/nemo/collections/asr/losses/rnnt_pytorch.py b/nemo/collections/asr/losses/rnnt_pytorch.py index ab0b5cf4f630..bc6e5a25a3b2 100644 --- a/nemo/collections/asr/losses/rnnt_pytorch.py +++ b/nemo/collections/asr/losses/rnnt_pytorch.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List + import torch from nemo.core.classes import Loss @@ -112,6 +114,136 @@ def compute_forward_prob(self, acts, labels, act_lens, label_lens): return log_prob +class TDTLossPytorch(Loss): + """ + Pure Python implementation of TDT loss (https://arxiv.org/pdf/2304.06795.pdf) + """ + + @property + def input_types(self): + """Input types definitions for CTCLoss. + """ + return { + "acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), + "labels": NeuralType(('B', 'T'), LabelsType()), + "act_lens": NeuralType(tuple('B'), LengthsType()), + "label_lens": NeuralType(tuple('B'), LengthsType()), + } + + @property + def output_types(self): + """Output types definitions for CTCLoss. 
+ loss: + NeuralType(None) + """ + return {"loss": NeuralType(elements_type=LossType())} + + def __init__(self, blank: int, durations: List[int] = [], reduction: str = 'sum', sigma: float = 0.0): + super().__init__() + self.blank = blank + self.durations = durations + self.n_durations = len(durations) + self.reduction = reduction + self.sigma = sigma + + def forward(self, acts, labels, act_lens, label_lens): + label_acts = acts[:, :, :, : -self.n_durations] + duration_acts = acts[:, :, :, -self.n_durations :] + + # the - self.sigma here is for logit-undernormalization. Check the paper for details. + label_acts = torch.log_softmax(label_acts, -1) - self.sigma + + duration_acts = torch.log_softmax(duration_acts, -1) + + forward_logprob, _ = self.compute_forward_prob(label_acts, duration_acts, labels, act_lens, label_lens) + losses = -forward_logprob + if self.reduction == 'mean_batch': + losses = losses.mean() # global batch size average + elif self.reduction == 'mean': + losses = torch.div(losses, label_lens).mean() + elif self.reduction == 'sum': + losses = losses.sum() + elif self.reduction == 'mean_volume': + losses = losses.sum() / label_lens.sum() # same as above but longer samples weigh more + + return losses + + def logsumexp(self, a, b): + ret = torch.logsumexp(torch.stack([a, b]), dim=0) + return ret + + def compute_forward_prob(self, acts, duration_acts, labels, act_lens, label_lens): + """This function implements Equation 7 in the TDT paper https://arxiv.org/pdf/2304.06795.pdf, + Simply put, for each alpha(t, u), it sums over the contribution from all incoming blank arcs and non-blank arcs. + """ + B, T, U, _ = acts.shape + + log_alpha = torch.zeros(B, T, U) + log_alpha = log_alpha.cuda() + for b in range(B): + for t in range(T): + for u in range(U): + if u == 0: + if t == 0: + # both t and u are 0, this is the base case for alphas. + log_alpha[b, t, u] = 0.0 + else: + # u = 0 and t != 0: only considers blank emissions. 
+ log_alpha[b, t, u] = -1000.0 + for n, l in enumerate(self.durations): + if ( + t - l >= 0 and l > 0 + ): # checking conditions for blank emission, l has to be at least 1 + tmp = ( + log_alpha[b, t - l, u] + + acts[b, t - l, u, self.blank] + + duration_acts[b, t - l, u, n] + ) + log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) + + else: + # u != 0 here, need to consider both blanks and non-blanks. + log_alpha[b, t, u] = -1000.0 + for n, l in enumerate(self.durations): + if t - l >= 0: + if l > 0: # for blank emissions. Need to ensure index is not out-of-bound. + tmp = ( + log_alpha[b, t - l, u] + + acts[b, t - l, u, self.blank] + + duration_acts[b, t - l, u, n] + ) + log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) + + # non-blank emissions. + tmp = ( + log_alpha[b, t - l, u - 1] + + acts[b, t - l, u - 1, labels[b, u - 1]] + + duration_acts[b, t - l, u - 1, n] + ) + log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) + + log_probs = [] + for b in range(B): + tt = torch.Tensor([-1000.0]).cuda()[0] + + # need to loop over all possible ways that blank with different durations contributes to the final loss. 
+ for n, l in enumerate(self.durations): + if act_lens[b] - l >= 0 and l > 0: + bb = ( + log_alpha[b, act_lens[b] - l, label_lens[b]] + + acts[b, act_lens[b] - l, label_lens[b], self.blank] + + duration_acts[b, act_lens[b] - l, label_lens[b], n] + ) + + tt = self.logsumexp(bb, 1.0 * tt) + + log_probs.append(tt) + + log_prob = torch.stack(log_probs) + + return log_prob, log_alpha + + class MultiblankRNNTLossPytorch(Loss): """ Pure Python implementation of multi-blank transducer loss (https://arxiv.org/pdf/2211.03541.pdf) @@ -136,7 +268,7 @@ def output_types(self): """ return {"loss": NeuralType(elements_type=LossType())} - def __init__(self, blank, big_blank_durations, reduction, sigma): + def __init__(self, blank, big_blank_durations, reduction: str = "sum", sigma: float = 0.0): super().__init__() self.blank = blank self.big_blank_durations = big_blank_durations @@ -145,7 +277,7 @@ def __init__(self, blank, big_blank_durations, reduction, sigma): def forward(self, acts, labels, act_lens, label_lens): acts = torch.log_softmax(acts, -1) - self.sigma - forward_logprob = self.compute_forward_prob(acts, labels, act_lens, label_lens) + forward_logprob, _ = self.compute_forward_prob(acts, labels, act_lens, label_lens) losses = -forward_logprob if self.reduction == 'mean_batch': @@ -234,4 +366,4 @@ def compute_forward_prob(self, acts, labels, act_lens, label_lens): log_probs.append(to_append) log_prob = torch.stack(log_probs) - return log_prob + return log_prob, log_alpha diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 1ccc2d0ac6fc..55f9f4b5ea9f 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -204,6 +204,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): self.blank_id = blank_id self.num_extra_outputs = joint.num_extra_outputs self.big_blank_durations = self.cfg.get("big_blank_durations", None) + self.durations = self.cfg.get("durations", None) 
self.compute_hypothesis_token_set = self.cfg.get("compute_hypothesis_token_set", False) self.compute_langs = decoding_cfg.get('compute_langs', False) self.preserve_alignments = self.cfg.get('preserve_alignments', None) @@ -211,9 +212,21 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): self.compute_timestamps = self.cfg.get('compute_timestamps', None) self.word_seperator = self.cfg.get('word_seperator', ' ') - if self.big_blank_durations is not None: + if self.durations is not None: # this means it's a TDT model. + if blank_id == 0: + raise ValueError("blank_id must equal len(non_blank_vocabs) for TDT models") + if self.big_blank_durations is not None: + raise ValueError("duration and big_blank_durations can't both be not None") + if self.cfg.strategy not in ['greedy', 'greedy_batch']: + raise ValueError("currently only greedy and greedy_batch inference is supported for TDT models") + + if self.big_blank_durations is not None: # this means it's a multi-blank model. if blank_id == 0: raise ValueError("blank_id must equal len(vocabs) for multi-blank RNN-T models") + if self.cfg.strategy not in ['greedy', 'greedy_batch']: + raise ValueError( + "currently only greedy and greedy_batch inference is supported for multi-blank models" + ) possible_strategies = ['greedy', 'greedy_batch', 'beam', 'tsd', 'alsd', 'maes'] if self.cfg.strategy not in possible_strategies: @@ -254,17 +267,33 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): if self.cfg.strategy == 'greedy': if self.big_blank_durations is None: - self.decoding = greedy_decode.GreedyRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) + if self.durations is None: + 
self.decoding = greedy_decode.GreedyRNNTInfer( + decoder_model=decoder, + joint_model=joint, + blank_index=self.blank_id, + max_symbols_per_step=( + self.cfg.greedy.get('max_symbols', None) + or self.cfg.greedy.get('max_symbols_per_step', None) + ), + preserve_alignments=self.preserve_alignments, + preserve_frame_confidence=self.preserve_frame_confidence, + confidence_method_cfg=self.confidence_method_cfg, + ) + else: + self.decoding = greedy_decode.GreedyTDTInfer( + decoder_model=decoder, + joint_model=joint, + blank_index=self.blank_id, + durations=self.durations, + max_symbols_per_step=( + self.cfg.greedy.get('max_symbols', None) + or self.cfg.greedy.get('max_symbols_per_step', None) + ), + preserve_alignments=self.preserve_alignments, + preserve_frame_confidence=self.preserve_frame_confidence, + confidence_method_cfg=self.confidence_method_cfg, + ) else: self.decoding = greedy_decode.GreedyMultiblankRNNTInfer( decoder_model=decoder, @@ -281,17 +310,34 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): elif self.cfg.strategy == 'greedy_batch': if self.big_blank_durations is None: - self.decoding = greedy_decode.GreedyBatchedRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) + if self.durations is None: + self.decoding = greedy_decode.GreedyBatchedRNNTInfer( + decoder_model=decoder, + joint_model=joint, + blank_index=self.blank_id, + max_symbols_per_step=( + self.cfg.greedy.get('max_symbols', None) + or self.cfg.greedy.get('max_symbols_per_step', None) + ), + preserve_alignments=self.preserve_alignments, + preserve_frame_confidence=self.preserve_frame_confidence, + confidence_method_cfg=self.confidence_method_cfg, + ) + else: + 
self.decoding = greedy_decode.GreedyBatchedTDTInfer( + decoder_model=decoder, + joint_model=joint, + blank_index=self.blank_id, + durations=self.durations, + max_symbols_per_step=( + self.cfg.greedy.get('max_symbols', None) + or self.cfg.greedy.get('max_symbols_per_step', None) + ), + preserve_alignments=self.preserve_alignments, + preserve_frame_confidence=self.preserve_frame_confidence, + confidence_method_cfg=self.confidence_method_cfg, + ) + else: self.decoding = greedy_decode.GreedyBatchedMultiblankRNNTInfer( decoder_model=decoder, @@ -481,12 +527,12 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp # RNN-T sample level is already preprocessed by implicit RNNT decoding # Simply remove any blank and possibly big blank tokens - if self.blank_id != 0: - num_extra_outputs = 0 - if self.big_blank_durations is not None: - num_extra_outputs += len(self.big_blank_durations) + if self.big_blank_durations is not None: # multi-blank RNNT + num_extra_outputs = len(self.big_blank_durations) prediction = [p for p in prediction if p < self.blank_id - num_extra_outputs] - else: + elif self.durations is not None: # TDT model. + prediction = [p for p in prediction if p < self.blank_id] + else: # standard RNN-T prediction = [p for p in prediction if p != self.blank_id] # De-tokenize the integer tokens; if not computing timestamps @@ -1058,9 +1104,12 @@ class RNNTDecoding(AbstractRNNTDecoding): def __init__( self, decoding_cfg, decoder, joint, vocabulary, ): - blank_id = ( - len(vocabulary) + joint.num_extra_outputs - ) # we need to ensure blank is the last token in the vocab. This is needed for multi-blank RNN-T models. + # we need to ensure blank is the last token in the vocab for the case of RNNT and Multi-blank RNNT. 
+ blank_id = len(vocabulary) + joint.num_extra_outputs + + if hasattr(decoding_cfg, 'model_type') and decoding_cfg.model_type == 'tdt': + blank_id = len(vocabulary) + self.labels_map = dict([(i, vocabulary[i]) for i in range(len(vocabulary))]) super(RNNTDecoding, self).__init__( @@ -1239,7 +1288,9 @@ def compute(self): @dataclass class RNNTDecodingConfig: + model_type: str = "rnnt" # one of "rnnt", "multiblank" or "tdt" strategy: str = "greedy_batch" + compute_hypothesis_token_set: bool = False # preserve decoding alignments diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index 99c71daebaa9..0870eb180776 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -196,11 +196,16 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): """ def __init__(self, decoding_cfg, decoder, joint, tokenizer: TokenizerSpec): - blank_id = tokenizer.tokenizer.vocab_size + blank_id = tokenizer.tokenizer.vocab_size # RNNT or TDT models. 
+ + # multi-blank RNNTs + if hasattr(decoding_cfg, 'model_type') and decoding_cfg.model_type == 'multiblank': + blank_id = tokenizer.tokenizer.vocab_size + joint.num_extra_outputs + self.tokenizer = tokenizer super(RNNTBPEDecoding, self).__init__( - decoding_cfg=decoding_cfg, decoder=decoder, joint=joint, blank_id=blank_id + joint.num_extra_outputs + decoding_cfg=decoding_cfg, decoder=decoder, joint=joint, blank_id=blank_id ) if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 84e08635834d..eec663813ca8 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -71,8 +71,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # Setup RNNT Loss loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get("loss", None)) + num_classes = self.joint.num_classes_with_blank - 1 # for standard RNNT and multi-blank + + if loss_name == 'tdt': + num_classes = num_classes - self.joint.num_extra_outputs + self.loss = RNNTLoss( - num_classes=self.joint.num_classes_with_blank - 1, + num_classes=num_classes, loss_name=loss_name, loss_kwargs=loss_kwargs, reduction=self.cfg.get("rnnt_reduction", "mean_batch"), diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py b/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py index 66e30c77590a..055d7aeb5fd9 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py @@ -13,4 +13,8 @@ # limitations under the License. 
from nemo.collections.asr.parts.numba.rnnt_loss.rnnt import rnnt_loss_cpu, rnnt_loss_gpu -from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import MultiblankRNNTLossNumba, RNNTLossNumba +from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import ( + MultiblankRNNTLossNumba, + RNNTLossNumba, + TDTLossNumba, +) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py index 64c8955006ed..118ee88acbfe 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py @@ -236,6 +236,133 @@ def rnnt_loss_gpu( return True +def tdt_loss_gpu( + label_acts: torch.Tensor, + duration_acts: torch.Tensor, + labels: torch.Tensor, + input_lengths: torch.Tensor, + label_lengths: torch.Tensor, + costs: torch.Tensor, + label_grads: torch.Tensor, + duration_grads: torch.Tensor, + blank_label: int, + durations: list, + fastemit_lambda: float, + clamp: float, + num_threads: int, + sigma: float, + omega: float, +): + """ + Wrapper method for accessing GPU TDT loss (https://arxiv.org/abs/2304.06795). + + CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). + + Args: + label_acts: Activation tensor of shape [B, T, U, V], where V includes the blank symbol. + duration_acts: Activation tensor of shape [B, T, U, D], where D is the number of durations. + labels: Ground truth labels of shape [B, U]. + input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. + label_lengths: Lengths of the target sequence as a vector of ints [B]. + costs: Zero vector of length [B] in which costs will be set. + label_grads: Zero tensor of shape [B, T, U, V] where the gradient to label_acts will be set. + duration_grads: Zero tensor of shape [B, T, U, D] where the gradient to duration_acts will be set. + blank_label: Index of the standard blank token in the vocabulary. 
+ durations: A list of supported durations for TDT. Must include 0 and 1. + fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to + FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. + clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. + num_threads: Number of threads for OpenMP. + sigma: logit-undernormalization weight used in the multi-blank model. Refer to + the multi-blank paper https://arxiv.org/abs/2304.06795 for detailed explanations. + omega: weight for regular RNN-T loss + """ + minibatch_size = label_acts.shape[0] + maxT = label_acts.shape[1] + maxU = label_acts.shape[2] + alphabet_size = label_acts.shape[3] + + if hasattr(cuda, 'external_stream'): + stream = cuda.external_stream(torch.cuda.current_stream(label_acts.device).cuda_stream) + else: + stream = cuda.default_stream() + + if num_threads < 0: + num_threads = multiprocessing.cpu_count() + + num_threads = max(1, num_threads) # have to use at least 1 thread + + gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=True) + + if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: + raise RuntimeError("Invalid parameter passed when calculating working space memory") + + # Select GPU index + cuda.select_device(label_acts.device.index) + gpu_workspace = torch.zeros(gpu_size, device=label_acts.device, dtype=label_acts.dtype, requires_grad=False) + + tdt_workspace = torch.zeros(len(durations), device=label_acts.device, dtype=torch.long, requires_grad=False) + + for i in range(0, len(durations)): + tdt_workspace[i] = durations[i] + + ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### + label_acts, label_acts_shape = rnnt_helper.flatten_tensor(label_acts) + duration_acts, duration_acts_shape = rnnt_helper.flatten_tensor(duration_acts) + + wrapper = gpu_rnnt.GPUTDT( + minibatch=minibatch_size, + maxT=maxT, + maxU=maxU, + alphabet_size=alphabet_size, + workspace=gpu_workspace, + 
tdt_workspace=tdt_workspace, + num_durations=len(durations), + blank=blank_label, + fastemit_lambda=fastemit_lambda, + clamp=clamp, + num_threads=num_threads, + stream=stream, + sigma=sigma, + omega=omega, + ) + + if label_grads is None: + status = wrapper.score_forward( + label_acts=label_acts.data, + duration_acts=duration_acts.data, + costs=costs.data, + pad_labels=labels.data, + label_lengths=label_lengths.data, + input_lengths=input_lengths.data, + ) + + if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: + raise RuntimeError("Could not calculate forward scores") + + else: + ### FLATTEN GRAD TENSOR ### + label_grads, label_grads_shape = rnnt_helper.flatten_tensor(label_grads) + duration_grads, duration_grads_shape = rnnt_helper.flatten_tensor(duration_grads) + + status = wrapper.cost_and_grad( + label_acts=label_acts.data, + duration_acts=duration_acts.data, + label_grads=label_grads.data, + duration_grads=duration_grads.data, + costs=costs.data, + pad_labels=labels.data, + label_lengths=label_lengths.data, + input_lengths=input_lengths.data, + ) + + if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: + raise RuntimeError("Could not calculate forward scores") + + del gpu_workspace, tdt_workspace, wrapper + return True + + def multiblank_rnnt_loss_gpu( acts: torch.Tensor, labels: torch.Tensor, diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py index 3ed9b82bf996..2ffe08be361e 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py @@ -34,7 +34,7 @@ from nemo.collections.asr.parts.numba.rnnt_loss import rnnt from nemo.collections.asr.parts.numba.rnnt_loss.utils.cpu_utils import cpu_rnnt -__all__ = ['rnnt_loss', 'RNNTLossNumba', 'MultiblankRNNTLossNumba'] +__all__ = ['rnnt_loss', 'RNNTLossNumba', 'MultiblankRNNTLossNumba', 'TDTLossNumba'] class _RNNTNumba(Function): @@ -91,6 +91,111 
@@ def backward(ctx, grad_output): return ctx.grads.mul_(grad_output), None, None, None, None, None, None, None +class _TDTNumba(Function): + """ + Numba class for Token-and-Duration Transducer (TDT) loss (https://arxiv.org/abs/2304.06795) + """ + + @staticmethod + def forward( + ctx, + label_acts, + duration_acts, + labels, + act_lens, + label_lens, + blank, + durations, + reduction, + fastemit_lambda, + clamp, + sigma, + omega, + ): + """ + log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network + labels: 2 dimensional Tensor containing all the targets of the batch with zero padded + act_lens: Tensor of size (batch) containing size of each output sequence from the network + label_lens: Tensor of (batch) containing label length of each example + fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to + FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. + + durations: list of durations for TDT model, must include 0 and 1, e.g. + [0, 1, 2, 3, 4]. + sigma: hyper-parameter for logit under-normalization method for training + TDT models. Recommended value 0.05. + omega: probability for sampling the standard RNN-T loss. 
+ Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for + the above parameters; + """ + is_cuda = label_acts.is_cuda + + certify_inputs(label_acts, labels, act_lens, label_lens) + if clamp < 0: + raise ValueError("`clamp` must be 0.0 or positive float value.") + + if is_cuda: + loss_func = rnnt.tdt_loss_gpu + else: + raise ValueError("TDT is not yet implemented for non CUDA computation.") + + label_grads = torch.zeros_like(label_acts) if label_acts.requires_grad else None + duration_grads = torch.zeros_like(duration_acts) if duration_acts.requires_grad else None + minibatch_size = label_acts.size(0) + costs = torch.zeros(minibatch_size, device=label_acts.device, dtype=label_acts.dtype) + + loss_func( + label_acts, + duration_acts, + labels=labels, + input_lengths=act_lens, + label_lengths=label_lens, + costs=costs, + label_grads=label_grads, + duration_grads=duration_grads, + blank_label=blank, + durations=durations, + fastemit_lambda=fastemit_lambda, + clamp=clamp, + sigma=sigma, + omega=omega, + num_threads=0, + ) + + if reduction in ['sum', 'mean']: + costs = costs.sum().unsqueeze_(-1) + if reduction == 'mean': + costs /= minibatch_size + + if label_grads is not None: + label_grads /= minibatch_size + duration_grads /= minibatch_size + + ctx.label_grads = label_grads + ctx.duration_grads = duration_grads + + return costs + + @staticmethod + def backward(ctx, grad_output): + if grad_output is not None and ctx.label_grads is not None: + grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.label_grads) + return ( + ctx.label_grads.mul_(grad_output), + ctx.duration_grads.mul_(grad_output), + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + class _MultiblankRNNTNumba(Function): """ Numba class for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541.pdf) @@ -237,6 +342,52 @@ def multiblank_rnnt_loss( ) +def tdt_loss( + acts, + labels, + act_lens, + label_lens, + blank, + durations=[], + reduction='mean', + 
fastemit_lambda: float = 0.0, + clamp: float = 0.0, +): + """ + TDT RNN Transducer (https://arxiv.org/abs/2304.06795) Loss (functional form) + Args: + acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network + labels: 2 dimensional Tensor containing all the targets of the batch with zero padded + act_lens: Tensor of size (batch) containing size of each output sequence from the network + label_lens: Tensor of (batch) containing label length of each example + blank (int): standard blank label. + durations: list of durations for TDT model, e.g. + [0,1,2,3,4]. + sigma: hyper-parameter for logit under-normalization method for training + multi-blank transducers. Recommended value 0.05. + Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for + the last two params. + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, + 'mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. Default: 'mean' + """ + if not acts.is_cuda: + # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping + # *after* we have obtained the gradients of loss(logsoftmax()). + # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. + # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore + # can inplace clamp the gradient. + if clamp > 0.0: + acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, clamp) + + # NOTE manually done log_softmax for CPU version, + # log_softmax is computed within GPU version. 
+ acts = torch.nn.functional.log_softmax(acts, -1) + + return _TDTNumba.apply(acts, labels, act_lens, label_lens, blank, durations, reduction, fastemit_lambda, clamp) + + class RNNTLossNumba(Module): """ Parameters: @@ -354,6 +505,79 @@ def forward(self, acts, labels, act_lens, label_lens): ) +class TDTLossNumba(Module): + """ + Parameters: + blank (int): standard blank label. + durations: list of durations for TDT model, e.g. + [0, 1, 2, 3, 4]. + sigma: hyper-parameter for logit under-normalization method for training + TDT. Recommended value 0.05. + omega: hyper-parameter for RNN-T loss for loss combination. + Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for + the above parameters; + + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, + 'mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. Default: 'mean' + fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to + FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. + clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. 
+ """ + + def __init__( + self, + blank, + durations=None, + reduction='mean', + fastemit_lambda: float = 0.0, + clamp: float = -1, + sigma: float = 0.0, + omega: float = 0.0, + ): + super(TDTLossNumba, self).__init__() + self.blank = blank + self.durations = durations if durations is not None else [] + self.fastemit_lambda = fastemit_lambda + self.clamp = float(clamp) if clamp > 0 else 0.0 + self.reduction = reduction + self.loss = _TDTNumba.apply + self.sigma = sigma + self.omega = omega + + def forward(self, acts, labels, act_lens, label_lens): + """ + log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network + labels: 2 dimensional Tensor containing all the targets of the batch with zero padded + act_lens: Tensor of size (batch) containing size of each output sequence from the network + label_lens: Tensor of (batch) containing label length of each example + """ + + # TODO(hainan): in the future, we could further optimize this so that we don't need to + # make contiguous copies of the acts tensor. 
+ label_acts, duration_acts = torch.split( + acts, [acts.shape[-1] - len(self.durations), len(self.durations)], dim=-1 + ) + label_acts = label_acts.contiguous() + duration_acts = torch.nn.functional.log_softmax(duration_acts, dim=-1).contiguous() + + return self.loss( + label_acts, + duration_acts, + labels, + act_lens, + label_lens, + self.blank, + self.durations, + self.reduction, + self.fastemit_lambda, + self.clamp, + self.sigma, + self.omega, + ) + + def check_type(var, t, name): if var.dtype is not t: raise TypeError("{} must be {}".format(name, t)) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py index dca4e732c062..70ffb459cb97 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py @@ -27,6 +27,7 @@ # limitations under the License. import multiprocessing +import random from typing import Optional, Tuple import numba @@ -499,24 +500,306 @@ def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]): An int, representing the offset of the used workspace (practically, the slice of the workspace consumed) A tuple of tensors representing the shared workspace. 
""" - used_offset = 0 + used_offset, (denom, alphas, betas, llForward, llBackward) = super()._prepare_workspace() - # // denom - denom = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ + bigblank_durations = self.big_blank_workspace[: self.num_big_blanks] - # // alphas & betas - alphas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ - betas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ + return used_offset, (denom, alphas, betas, llForward, llBackward, bigblank_durations) - # // logllh - llForward = self.gpu_workspace[used_offset : used_offset + self.minibatch_] - used_offset += self.minibatch_ - llBackward = self.gpu_workspace[used_offset : used_offset + self.minibatch_] - used_offset += self.minibatch_ - bigblank_durations = self.big_blank_workspace[: self.num_big_blanks] +class GPUTDT(GPURNNT): + def __init__( + self, + sigma: float, + omega: float, + num_durations: int, + minibatch: int, + maxT: int, + maxU: int, + alphabet_size: int, + workspace, + tdt_workspace, + blank: int, + fastemit_lambda: float, + clamp: float, + num_threads: int, + stream, + ): + """ + Helper class to launch the CUDA Kernels to compute TDT Loss (https://arxiv.org/pdf/2211.03541). - return used_offset, (denom, alphas, betas, llForward, llBackward, bigblank_durations) + Args: + sigma: Hyper-parameter related to the logit-normalization method in training tdt transducers. + omega: Hyper-parameter related to the sampled training. + num_durations: Number of durations the model supports. + minibatch: Int representing the batch size. + maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. + maxU: The maximum possible target sequence length. 
Represents U in the logprobs tensor. + alphabet_size: The vocabulary dimension V + 1 + num-big-blanks + workspace: An allocated chunk of memory that will be sliced off and reshaped into required + blocks used as working memory. + tdt_workspace: An allocated chunk of memory that will be sliced off and reshaped into required + blocks used as working memory specifically for the tdt related computations. + blank: Index of the blank token in the vocabulary. Must be the last token in the vocab. + fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to + FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. + clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. + num_threads: Number of OMP threads to launch. + stream: Numba Cuda Stream. + """ + super().__init__( + minibatch, maxT, maxU, alphabet_size, workspace, blank, fastemit_lambda, clamp, num_threads, stream + ) + self.tdt_workspace = cuda.as_cuda_array( + tdt_workspace + ) # a flat vector of integer numbers that represents allocated memory slices + + self.num_durations = num_durations + self.sigma = sigma + self.omega = omega + + def compute_cost_and_score( + self, + label_acts: torch.Tensor, + duration_acts: torch.Tensor, + label_grads: Optional[torch.Tensor], + duration_grads: Optional[torch.Tensor], + costs: torch.Tensor, + labels: torch.Tensor, + label_lengths: torch.Tensor, + input_lengths: torch.Tensor, + ) -> global_constants.RNNTStatus: + """ + Compute both the loss and the gradients. + + Args: + label_acts: A flattened tensor of shape [B, T, U, V] representing the activation matrix for tokens. + duration_acts: A flattened tensor of shape [B, T, U, D] representing the activation matrix for durations. + label_grad: A flattented zero tensor of same shape as label_acts. + duration_grad: A flattented zero tensor of same shape as duration_acts. 
+ costs: A zero vector of length B which will be updated inplace with the log probability costs. + flat_labels: A flattened matrix of labels of shape [B, U] + label_lengths: A vector of length B that contains the original lengths of the acoustic sequence. + input_lengths: A vector of length B that contains the original lengths of the target sequence. + + Updates: + This will launch kernels that will update inline the following variables: + - *_grads: Gradients of the activation matrix wrt the costs vector. + - costs: Negative log likelihood of the forward variable. + + Returns: + An enum that either represents a successful RNNT operation or failure. + """ + training = label_grads is not None + + if training: + label_grads *= 0.0 # zero grads + duration_grads *= 0.0 # zero grads + + _, (denom, alphas, betas, llForward, llBackward, durations) = self._prepare_workspace() + + ######## START EXECUTION ######## + self.log_softmax(label_acts, denom) + + r = random.uniform(0, 1) + if r < self.omega: + # Compute alphas + gpu_rnnt_kernel.compute_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( + label_acts, + denom, + alphas, + llForward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + ) + else: + # Compute alphas + gpu_rnnt_kernel.compute_tdt_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( + label_acts, + duration_acts, + denom, + self.sigma, + alphas, + llForward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + durations, + self.num_durations, + ) + + if training: + # Compute betas + if r < self.omega: + gpu_rnnt_kernel.compute_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( + label_acts, + denom, + betas, + llBackward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + ) + + # Compute gradient 
+ grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ + grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE + gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0]( + label_grads, + label_acts, + denom, + alphas, + betas, + llForward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + self.fastemit_lambda_, + self.clamp_, + ) + else: + gpu_rnnt_kernel.compute_tdt_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( + label_acts, + duration_acts, + denom, + self.sigma, + betas, + llBackward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + durations, + self.num_durations, + ) + + # Compute gradient + grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ + grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE + gpu_rnnt_kernel.compute_tdt_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0]( + label_grads, + duration_grads, + label_acts, + duration_acts, + denom, + self.sigma, + alphas, + betas, + llForward, + input_lengths, + label_lengths, + labels, + self.minibatch_, + self.maxT_, + self.maxU_, + self.alphabet_size_, + self.blank_, + durations, + self.num_durations, + self.fastemit_lambda_, + self.clamp_, + ) + + # // cost copy, negate (for log likelihood) and update with additional regularizers + # This needs to be done via CUDA, because we used temporary memory llForward + # passed to alpha, which was updated with log likelihoods. + # But copying this data into a pytorch pointer is more difficult (numba api is one way) + # Therefore launch a pointwise CUDA kernel to update the costs inplace from data of llForward + # Then negate to compute the loglikelihood. 
+ threadsperblock = min(costs.shape[0], 32) + blockspergrid = (costs.shape[0] + (threadsperblock - 1)) // threadsperblock + rnnt_helper.compute_costs_data[blockspergrid, threadsperblock, self.stream_, 0]( + llForward, costs, self.fastemit_lambda_ + ) + self.stream_.synchronize() + + return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS + + def cost_and_grad( + self, + label_acts: torch.Tensor, + duration_acts: torch.Tensor, + label_grads: torch.Tensor, + duration_grads: torch.Tensor, + costs: torch.Tensor, + pad_labels: torch.Tensor, + label_lengths: torch.Tensor, + input_lengths: torch.Tensor, + ): + if ( + duration_acts is None + or label_acts is None + or label_grads is None + or duration_grads is None + or costs is None + or pad_labels is None + or label_lengths is None + or input_lengths is None + ): + return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE + + return self.compute_cost_and_score( + label_acts, duration_acts, label_grads, duration_grads, costs, pad_labels, label_lengths, input_lengths + ) + + def score_forward( + self, + label_acts: torch.Tensor, + duration_acts: torch.Tensor, + costs: torch.Tensor, + pad_labels: torch.Tensor, + label_lengths: torch.Tensor, + input_lengths: torch.Tensor, + ): + if ( + label_acts is None + or duration_acts is None + or costs is None + or pad_labels is None + or label_lengths is None + or input_lengths is None + ): + return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE + + return self.compute_cost_and_score( + label_acts, duration_acts, None, None, costs, pad_labels, label_lengths, input_lengths + ) + + def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]): + """ + Helper method that uses the workspace and constructs slices of it that can be used. + + Returns: + An int, representing the offset of the used workspace (practically, the slice of the workspace consumed) + A tuple of tensors representing the shared workspace. 
+ """ + used_offset, (denom, alphas, betas, llForward, llBackward) = super()._prepare_workspace() + + durations = self.tdt_workspace[: self.num_durations] + + return used_offset, (denom, alphas, betas, llForward, llBackward, durations) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py index dbeb1544e7e3..4153af060941 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py @@ -35,6 +35,8 @@ GPU_RNNT_THREAD_SIZE = 256 +INF = 10000.0 + @cuda.jit(device=True, inline=True) def logp( @@ -62,6 +64,12 @@ def logp( return denom[col] + acts[col * alphabet_size + v] +@cuda.jit(device=True, inline=True) +def logp_duration(acts: torch.Tensor, maxT: int, maxU: int, num_durations: int, mb: int, t: int, u: int, v: int): + col = (mb * maxT + t) * maxU + u + return acts[col * num_durations + v] + + @cuda.jit() def compute_alphas_kernel( acts: torch.Tensor, @@ -875,3 +883,526 @@ def compute_multiblank_grad_kernel( # update internal index through the thread_buffer; # until idx < V + 1, such that entire vocabulary has been updated. idx += GPU_RNNT_THREAD_SIZE + + +@cuda.jit() +def compute_tdt_alphas_kernel( + acts: torch.Tensor, + duration_acts: torch.Tensor, + denom: torch.Tensor, + sigma: float, + alphas: torch.Tensor, + llForward: torch.Tensor, + xlen: torch.Tensor, + ylen: torch.Tensor, + mlabels: torch.Tensor, # [B] + minibatch: int, + maxT: int, + maxU: int, + alphabet_size: int, + blank_: int, + durations: torch.Tensor, + num_durations: int, +): + """ + Compute alpha (forward variable) probabilities over the transduction step. + + Args: + acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens. + duration_acts: Tensor of shape [B, T, U, D] flattened. 
Represents the logprobs activation tensor for duration. + denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor for tokens. + + alphas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the forward variable + probabilities. + llForward: Zero tensor of shape [B]. Represents the log-likelihood of the forward pass. + Returned as the forward pass loss that is reduced by the optimizer. + xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded + activation tensor. + ylen: Vector of length B which contains the actual target sequence lengths in the padded + activation tensor. + mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). + The matrix contains the padded target transcription that must be predicted. + minibatch: Int representing the batch size. + maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. + maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. + alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). + blank_: Index of the TDT blank token in the vocabulary. Must be the last token in the vocab. + + Updates: + Kernel inplace updates the following inputs: + - alphas: forward variable scores. + - llForward: log-likelihood of forward variable. 
+ """ + # // launch B blocks, each block has U threads + b = cuda.blockIdx.x # // batch id + u = cuda.threadIdx.x # label id, u + T = xlen[b] # select AM length of current sample + U = ylen[b] + 1 # select target length of current sample, +1 for the blank token + + labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) + offset = b * maxT * maxU # pointer indexing offset + + # alphas += offset # pointer offset, ignored since we explicitly add offset + + # Initilize alpha[b, t=0, u=0] for all b in B + if u == 0: + alphas[offset] = 0 + + # sync until all alphas are initialized + cuda.syncthreads() + + # Ordinary alpha calculations, broadcast across B=b and U=u + # Look up forward variable calculation from rnnt_numpy.forward_pass() + for n in range(1, T + U - 1): + t = n - u + + if u == 0: + # when u == 0, we only consider blank emissions. + if t > 0 and t < T: + alphas[offset + t * maxU + u] = -INF + + for i in range(1, num_durations): # skip 0 since blank emission has to advance by at least one + if t >= durations[i]: + alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( + alphas[offset + t * maxU + u], # the current alpha value + alphas[offset + (t - durations[i]) * maxU + u] # alpha(t - duration, u) + + logp( + denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u, blank_ + ) # logp of blank emission + - sigma # logit under-normalization + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t - durations[i], u, i + ), # logp of duration + ) + else: + break # since durations are in ascending order, when we encounter a duration that is too large, then + # there is no need to check larger durations after that. + + elif u < U: + # when t == 0, we only consider the non-blank emission. 
+ if t == 0: + alphas[offset + u] = ( + alphas[offset + u - 1] # alpha(t, u - 1) + + logp( + denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1] + ) # logp of token emission + - sigma # logit under-normalization + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t, u - 1, 0 + ) # t = 0, so it must be duration = 0. Therefore the last argument passed to logp_duration() is 0. + ) + + # now we have t != 0 and u != 0, and we need to consider both non-blank and blank emissions. + elif t > 0 and t < T: + no_emit = -INF # no_emit stores the score for all blank emissions. + for i in range(1, num_durations): + if t >= durations[i]: + no_emit = rnnt_helper.log_sum_exp( + no_emit, # current score + alphas[offset + (t - durations[i]) * maxU + u] # alpha(t - duration, u) + + logp( + denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u, blank_ + ) # logp of blank emission + - sigma # logit under-normalization + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t - durations[i], u, i + ), # logp of duration + ) + else: + break # we can exit the loop early here, same as the case for u == 0 above. + + emit = -INF # emit stores the score for non-blank emissions. + for i in range(0, num_durations): + if t >= durations[i]: + emit = rnnt_helper.log_sum_exp( + emit, # current score + alphas[offset + (t - durations[i]) * maxU + u - 1] # alpha(t - duration, u - 1) + + logp( + denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u - 1, labels[u - 1] + ) # logp of non-blank emission + - sigma # logit under-normalization + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t - durations[i], u - 1, i + ), # logp of duration + ) + else: + break # we can exit the loop early here, same as the case for u == 0 above. + + # combining blank and non-blank emissions. 
+ alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) + + # sync across all B=b and U=u + cuda.syncthreads() + + # After final sync, the forward log-likelihood can be computed as the summataion of + # alpha(T - duration, U - 1) + logp(blank, duration | t - duration, U - 1), over different durations. + if u == 0: + # first we consider duration = 1 + loglike = ( + alphas[offset + (T - 1) * maxU + U - 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) + ) + + # then we add the scores for duration > 1, if such durations are possible given the audio lengths. + for i in range(2, num_durations): + if T >= durations[i]: + big_blank_loglike = ( + alphas[offset + (T - durations[i]) * maxU + U - 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, T - durations[i], U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - durations[i], U - 1, i) + ) + loglike = rnnt_helper.log_sum_exp(loglike, big_blank_loglike) + else: + break + + llForward[b] = loglike + + +@cuda.jit() +def compute_tdt_betas_kernel( + acts: torch.Tensor, + duration_acts: torch.Tensor, + denom: torch.Tensor, + sigma: float, + betas: torch.Tensor, + llBackward: torch.Tensor, + xlen: torch.Tensor, + ylen: torch.Tensor, + mlabels: torch.Tensor, # [B, U] + minibatch: int, + maxT: int, + maxU: int, + alphabet_size: int, + blank_: int, + durations: torch.Tensor, + num_durations: int, +): + """ + Compute beta (backward variable) probabilities over the transduction step. + + Args: + acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens. + duration_acts: Tensor of shape [B, T, U, D] flattened. Represents the logprobs activation tensor for duations. + denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor + across entire vocabulary. 
+ betas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the backward variable + probabilities. + llBackward: Zero tensor of shape [B]. Represents the log-likelihood of the backward pass. + Returned as the backward pass loss that is reduced by the optimizer. + xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded + activation tensor. + ylen: Vector of length B which contains the actual target sequence lengths in the padded + activation tensor. + mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). + The matrix contains the padded target transcription that must be predicted. + minibatch: Int representing the batch size. + maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. + maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. + alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). + blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. + + Updates: + Kernel inplace updates the following inputs: + - betas: backward variable scores. + - llBackward: log-likelihood of backward variable. 
+ """ + # // launch B blocks, each block has U threads + b = cuda.blockIdx.x # // batch id + u = cuda.threadIdx.x # label id, u + T = xlen[b] # select AM length of current sample + U = ylen[b] + 1 # select target length of current sample, +1 for the blank token + + labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) + offset = b * maxT * maxU # pointer indexing offset + + # betas += offset # pointer offset, ignored since we explicitly add offset + + # Initilize beta[b, t=T-1, u=U-1] for all b in B with log_probs[b, t=T-1, u=U-1, blank] + if u == 0: + betas[offset + (T - 1) * maxU + U - 1] = ( + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) + - sigma + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) + ) + + # sync until all betas are initialized + cuda.syncthreads() + + # Ordinary beta calculations, broadcast across B=b and U=u + # Look up backward variable calculation from rnnt_numpy.backward_pass() + for n in range(T + U - 2, -1, -1): + t = n - u + + if u == U - 1: + # u == U - 1, we only consider blank emissions. + if t >= 0 and t + 1 < T: + betas[offset + t * maxU + U - 1] = -INF + for i in range(1, num_durations): + # although similar, the computation for beta's is slightly more complex for boundary cases. + # the following two cases correspond to whether t is exactly certain duration away from T. + # and they have slightly different update rules. + + if t + durations[i] < T: + betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( + betas[offset + t * maxU + U - 1], + betas[ + offset + (t + durations[i]) * maxU + U - 1 + ] # beta[t, U - 1] depends on the value beta[t + duration, U - 1] here. 
+ + logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_) # log prob of blank + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t, U - 1, i + ) # log prob of duration (durations[i]) + - sigma, # for logit undernormalization + ) + elif t + durations[i] == T: + betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( + betas[offset + t * maxU + U - 1], + # here we have one fewer term than the "if" block above. This could be seen as having "0" here since + # beta[t + duration, U - 1] isn't defined because t + duration is out of bound. + logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_) # log prob of blank + + logp_duration( + duration_acts, maxT, maxU, num_durations, b, t, U - 1, i + ) # log prob of duration (durations[i]) + - sigma, # for logit undernormalization. Basically every time sigma shows up is because of logit undernormalization. + ) + + elif u < U - 1: + if t == T - 1: + # t == T - 1, so we only consider non-blank with duration 0. (Note, we can't have blank emissions with duration = 0) + betas[offset + (T - 1) * maxU + u] = ( + betas[offset + (T - 1) * maxU + u + 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u]) # non-blank log prob + + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, u, 0) # log prob of duration 0 + - sigma + ) + + elif t >= 0 and t < T - 1: + # now we need to consider both blank andnon-blanks. Similar to alphas, we first compute them separately with no_emit and emit. 
+ no_emit = -INF + for i in range(1, num_durations): + if t + durations[i] < T: + no_emit = rnnt_helper.log_sum_exp( + no_emit, + betas[offset + (t + durations[i]) * maxU + u] + + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, blank_) + + logp_duration(duration_acts, maxT, maxU, num_durations, b, t, u, i) + - sigma, + ) + + emit = -INF + for i in range(0, num_durations): + if t + durations[i] < T: + emit = rnnt_helper.log_sum_exp( + emit, + betas[offset + (t + durations[i]) * maxU + u + 1] + + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, labels[u]) + + logp_duration(duration_acts, maxT, maxU, num_durations, b, t, u, i) + - sigma, + ) + + # combining all blank emissions and all non-blank emissions. + betas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) + + # sync across all B=b and U=u + cuda.syncthreads() + + # After final sync, betas[b, 0, 0] gives log-likelihood of backward pass, same with conventional Transducers. + if u == 0: + llBackward[b] = betas[offset] + + +@cuda.jit() +def compute_tdt_grad_kernel( + label_grads: torch.Tensor, + duration_grads: torch.Tensor, + acts: torch.Tensor, + duration_acts: torch.Tensor, + denom: torch.Tensor, + sigma: float, + alphas: torch.Tensor, + betas: torch.Tensor, + logll: torch.Tensor, + xlen: torch.Tensor, + ylen: torch.Tensor, + mlabels: torch.Tensor, # [B, U] + minibatch: int, + maxT: int, + maxU: int, + alphabet_size: int, + blank_: int, + durations: torch.Tensor, + num_durations: int, + fastemit_lambda: float, + clamp: float, +): + """ + Compute gradients over the transduction step. + + Args: + grads: Zero Tensor of shape [B, T, U, V] to store gradients for tokens. + duration_grads: Zero Tensor of shape [B, T, U, D] to store gradients for durations. + + acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens. + duration_acts: Tensor of shape [B, T, U, D] flattened. Represents the logprobs activation tensor for durations. 
+ denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor + across entire vocabulary. + alphas: Alpha variable, contains forward probabilities. A tensor of shape [B, T, U]. + betas: Beta varoable, contains backward probabilities. A tensor of shape [B, T, U]. + logll: Log-likelihood of the forward variable, represented as a vector of shape [B]. + Represents the log-likelihood of the forward pass. + xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded + activation tensor. + ylen: Vector of length B which contains the actual target sequence lengths in the padded + activation tensor. + mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). + The matrix contains the padded target transcription that must be predicted. + minibatch: Int representing the batch size. + maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. + maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. + alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). + blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. + fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to + FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. + clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. + + Updates: + Kernel inplace updates the following inputs: + - grads: Gradients with respect to the log likelihood (logll). 
+ """ + # Kernel call: + # blocks_per_grid = minibatch (b) * maxT (t) * maxU (u) + # threads_per_block = constant buffer size of parallel threads (v :: Constant) + tid = cuda.threadIdx.x # represents v, taking steps of some constant size + idx = tid # index of v < V+1; in steps of constant buffer size + col = cuda.blockIdx.x # represents a fused index of b * t * u + + # Decompose original indices from fused `col` + u = col % maxU # (b * t * u) % u = u + bt = (col - u) // maxU # (b * t * u - u) // U = b * t + t = bt % maxT # (b * t) % t = t + mb = (bt - t) // maxT # (b * t - t) // T = b + + # constants + T = xlen[mb] # select AM length of current sample + U = ylen[mb] + 1 # select target length of current sample, +1 for the blank token + labels: torch.Tensor = mlabels[mb] # labels = mlabels + mb * (maxU - 1); + + # Buffered gradient calculations, broadcast across B=b, T=t and U=u, looped over V with some constant stride. + # Look up gradient calculation from rnnt_numpy.compute_gradient() + + if t < T and u < U: + logpk_blank = ( + denom[col] + acts[col * alphabet_size + blank_] - sigma + ) # whenever sigma is used, it is for logit under-normalization. + + if idx < num_durations: + grad = 0.0 + if t + durations[idx] < T and u < U - 1: # for label + logpk_label = denom[col] + acts[col * alphabet_size + labels[u]] - sigma + grad -= math.exp(alphas[col] + betas[col + 1 + durations[idx] * maxU] + logpk_label - logll[mb]) + + if t + durations[idx] < T and idx > 0: # for blank in the middle + grad -= math.exp(alphas[col] + betas[col + durations[idx] * maxU] + logpk_blank - logll[mb]) + + if t + durations[idx] == T and idx >= 1 and u == U - 1: # for blank as the last symbol + grad -= math.exp(alphas[col] + logpk_blank - logll[mb]) + + grad = grad * math.exp(duration_acts[col * num_durations + idx]) + duration_grads[col * num_durations + idx] = grad + + # For cuda kernels, maximum number of threads per block is limited to some value. 
+ # However, it may be the case that vocabulary size is larger than this limit + # To work around this, an arbitrary thread buffer size is chosen such that, + # 1) each element within the thread pool operates independently of the other + # 2) An inner while loop moves the index of each buffer element by the size of the buffer itself, + # such that all elements of the vocabulary size are covered in (V + 1 // thread_buffer) number of steps. + # As such, each thread will perform the while loop at least (V + 1 // thread_buffer) number of times + while idx < alphabet_size: + # remember, `col` represents the tri-index [b, t, u] + # therefore; logpk = denom[b, t, u] + acts[b, t, u, v] + logpk = denom[col] + acts[col * alphabet_size + idx] + # initialize the grad of the sample acts[b, t, u, v] + grad = math.exp(alphas[col] + betas[col] + logpk - logll[mb]) + + # If FastEmit regularization is enabled, calculate the gradeint of probability of predicting the next label + # at the current timestep. + # The formula for this is Equation 9 in https://arxiv.org/abs/2010.11148, multiplied by the log probability + # of the current step (t, u), normalized by the total log likelihood. + # Once the gradient has been calculated, scale it by `fastemit_lambda`, as in Equation 10. 
+ if fastemit_lambda > 0.0 and u < U - 1: + fastemit_grad = 0.0 + + for i in range(0, num_durations): + if t + durations[i] < T: + fastemit_grad += fastemit_lambda * math.exp( + alphas[col] # alphas(t, u) + + (denom[col] + acts[col * alphabet_size + labels[u]]) # log prob of token emission + + duration_acts[col * num_durations + i] # duration log-prob + + betas[col + 1 + durations[i] * maxU] # betas(t, u+1) + + logpk # log Pr(k|t, u) + - sigma # for logit under-normalization + - logll[mb] # total log likelihood for normalization + ) + else: + fastemit_grad = 0.0 + + # Update the gradient of act[b, t, u, v] with the gradient from FastEmit regularization + grad = grad + fastemit_grad + + # grad to last blank transition + # grad[b, T-1, U-1, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + logp(duration) for all possible non-zero durations. + if idx == blank_ and u == U - 1: + for i in range(1, num_durations): + if t == T - durations[i]: + grad -= math.exp( + alphas[col] + logpk - sigma - logll[mb] + duration_acts[col * num_durations + i] + ) + + # grad of blank across t < T; + # grad[b, t 0.0: + g = label_grads[col * alphabet_size + idx] + g = min(g, clamp) + g = max(g, -clamp) + label_grads[col * alphabet_size + idx] = g + + # update internal index through the thread_buffer; + # until idx < V + 1, such that entire vocabulary has been updated. + idx += GPU_RNNT_THREAD_SIZE diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index 5e98b03f2fe2..42b14fd7b8bf 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -2202,3 +2202,564 @@ class GreedyBatchedRNNTInferConfig: preserve_alignments: bool = False preserve_frame_confidence: bool = False confidence_method_cfg: Optional[ConfidenceMethodConfig] = None + + +class GreedyTDTInfer(_GreedyRNNTInfer): + """A greedy TDT decoder. 
+ + Sequence level greedy decoding, performed auto-repressively. + + Args: + decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. + joint_model: rnnt_utils.AbstractRNNTJoint implementation. + blank_index: int index of the blank token. Must be len(vocabulary) for TDT models. + durations: a list containing durations for TDT. + max_symbols_per_step: Optional int. The maximum number of symbols that can be added + to a sequence in a single time step; if set to None then there is + no limit. + preserve_alignments: Bool flag which preserves the history of alignments generated during + greedy decoding (sample / batched). When set to true, the Hypothesis will contain + the non-null value for `alignments` in it. Here, `alignments` is a List of List of + Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). + The length of the list corresponds to the Acoustic Length (T). + Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. + U is the number of target tokens for the current timestep Ti. + preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated + during greedy decoding (sample / batched). When set to true, the Hypothesis will contain + the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. + The length of the list corresponds to the Acoustic Length (T). + Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. + U is the number of target tokens for the current timestep Ti. + confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence scores. + name: The method name (str). + Supported values: + - 'max_prob' for using the maximum token probability as a confidence. + - 'entropy' for using normalized entropy of a log-likelihood vector. + entropy_type: Which type of entropy to use (str). 
Used if confidence_method_cfg.name is set to `entropy`. + Supported values: + - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). + Note that for this entropy, the temperature should comply the following inequality: + 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. + Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/Tsallis_entropy + - 'renui' for the Rényi entropy. + Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy + temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the temperature equals one, scaling is not applied to 'max_prob', + and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. + Supported values: + - 'lin' for using the linear mapping. + - 'exp' for using exponential mapping with linear shift. 
+ """ + + def __init__( + self, + decoder_model: rnnt_abstract.AbstractRNNTDecoder, + joint_model: rnnt_abstract.AbstractRNNTJoint, + blank_index: int, + durations: list, + max_symbols_per_step: Optional[int] = None, + preserve_alignments: bool = False, + preserve_frame_confidence: bool = False, + confidence_method_cfg: Optional[DictConfig] = None, + ): + super().__init__( + decoder_model=decoder_model, + joint_model=joint_model, + blank_index=blank_index, + max_symbols_per_step=max_symbols_per_step, + preserve_alignments=preserve_alignments, + preserve_frame_confidence=preserve_frame_confidence, + confidence_method_cfg=confidence_method_cfg, + ) + self.durations = durations + + @typecheck() + def forward( + self, + encoder_output: torch.Tensor, + encoded_lengths: torch.Tensor, + partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + ): + """Returns a list of hypotheses given an input batch of the encoder hidden embedding. + Output token is generated auto-repressively. + Args: + encoder_output: A tensor of size (batch, features, timesteps). + encoded_lengths: list of int representing the length of each sequence + output sequence. + Returns: + packed list containing batch number of sentences (Hypotheses). 
+ """ + # Preserve decoder and joint training state + decoder_training_state = self.decoder.training + joint_training_state = self.joint.training + + with torch.inference_mode(): + # Apply optional preprocessing + encoder_output = encoder_output.transpose(1, 2) # (B, T, D) + + self.decoder.eval() + self.joint.eval() + + hypotheses = [] + # Process each sequence independently + with self.decoder.as_frozen(), self.joint.as_frozen(): + for batch_idx in range(encoder_output.size(0)): + inseq = encoder_output[batch_idx, :, :].unsqueeze(1) # [T, 1, D] + logitlen = encoded_lengths[batch_idx] + + partial_hypothesis = partial_hypotheses[batch_idx] if partial_hypotheses is not None else None + hypothesis = self._greedy_decode(inseq, logitlen, partial_hypotheses=partial_hypothesis) + hypotheses.append(hypothesis) + + # Pack results into Hypotheses + packed_result = pack_hypotheses(hypotheses, encoded_lengths) + + self.decoder.train(decoder_training_state) + self.joint.train(joint_training_state) + + return (packed_result,) + + @torch.no_grad() + def _greedy_decode( + self, x: torch.Tensor, out_len: torch.Tensor, partial_hypotheses: Optional[rnnt_utils.Hypothesis] = None + ): + # x: [T, 1, D] + # out_len: [seq_len] + + # Initialize blank state and empty label set in Hypothesis + hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) + + if partial_hypotheses is not None: + hypothesis.last_token = partial_hypotheses.last_token + hypothesis.y_sequence = ( + partial_hypotheses.y_sequence.cpu().tolist() + if isinstance(partial_hypotheses.y_sequence, torch.Tensor) + else partial_hypotheses.y_sequence + ) + if partial_hypotheses.dec_state is not None: + hypothesis.dec_state = self.decoder.batch_concat_states([partial_hypotheses.dec_state]) + hypothesis.dec_state = _states_to_device(hypothesis.dec_state, x.device) + + if self.preserve_alignments: + # Alignments is a 2-dimensional dangling list representing T x U + 
hypothesis.alignments = [[]] + + if self.preserve_frame_confidence: + hypothesis.frame_confidence = [[]] + + time_idx = 0 + while time_idx < out_len: + # Extract encoder embedding at timestep t + # f = x[time_idx, :, :].unsqueeze(0) # [1, 1, D] + f = x.narrow(dim=0, start=time_idx, length=1) + + # Setup exit flags and counter + not_blank = True + symbols_added = 0 + + need_loop = True + # While blank is not predicted, or we dont run out of max symbols per timestep + while need_loop and (self.max_symbols is None or symbols_added < self.max_symbols): + # In the first timestep, we initialize the network with RNNT Blank + # In later timesteps, we provide previous predicted label as input. + if hypothesis.last_token is None and hypothesis.dec_state is None: + last_label = self._SOS + else: + last_label = label_collate([[hypothesis.last_token]]) + + # Perform prediction network and joint network steps. + g, hidden_prime = self._pred_step(last_label, hypothesis.dec_state) + # If preserving per-frame confidence, log_normalize must be true + logits = self._joint_step(f, g, log_normalize=False) + logp = logits[0, 0, 0, : -len(self.durations)] + if self.preserve_frame_confidence: + logp = torch.log_softmax(logp, -1) + + duration_logp = torch.log_softmax(logits[0, 0, 0, -len(self.durations) :], dim=-1) + del g + + # torch.max(0) op doesnt exist for FP 16. + if logp.dtype != torch.float32: + logp = logp.float() + + # get index k, of max prob + v, k = logp.max(0) + k = k.item() # K is the label at timestep t_s in inner loop, s >= 0. 
+ + d_v, d_k = duration_logp.max(0) + d_k = d_k.item() + + skip = self.durations[d_k] + + if self.preserve_alignments: + # insert logprobs into last timestep + hypothesis.alignments[-1].append((logp.to('cpu'), torch.tensor(k, dtype=torch.int32))) + + if self.preserve_frame_confidence: + # insert confidence into last timestep + hypothesis.frame_confidence[-1].append(self._get_confidence(logp)) + + del logp + + # If blank token is predicted, exit inner loop, move onto next timestep t + if k == self._blank_index: + not_blank = False + + # this rarely happens, but we manually increment the `skip` number + # if blank is emitted and duration=0 is predicted. This prevents possible + # infinite loops. + if skip == 0: + skip = 1 + + if self.preserve_alignments: + # convert Ti-th logits into a torch array + hypothesis.alignments.append([]) # blank buffer for next timestep + + if self.preserve_frame_confidence: + hypothesis.frame_confidence.append([]) # blank buffer for next timestep + else: + # Append token to label set, update RNN state. + hypothesis.y_sequence.append(k) + hypothesis.score += float(v) + hypothesis.timestep.append(time_idx) + hypothesis.dec_state = hidden_prime + hypothesis.last_token = k + + # Increment token counter. + symbols_added += 1 + time_idx += skip + need_loop = skip == 0 + + if symbols_added == self.max_symbols: + time_idx += 1 + + # Remove trailing empty list of Alignments + if self.preserve_alignments: + if len(hypothesis.alignments[-1]) == 0: + del hypothesis.alignments[-1] + + # Remove trailing empty list of per-frame confidence + if self.preserve_frame_confidence: + if len(hypothesis.frame_confidence[-1]) == 0: + del hypothesis.frame_confidence[-1] + + # Unpack the hidden states + hypothesis.dec_state = self.decoder.batch_select_state(hypothesis.dec_state, 0) + + return hypothesis + + +class GreedyBatchedTDTInfer(_GreedyRNNTInfer): + """A batch level greedy TDT decoder. + Batch level greedy decoding, performed auto-repressively. 
+ Args: + decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. + joint_model: rnnt_utils.AbstractRNNTJoint implementation. + blank_index: int index of the blank token. Must be len(vocabulary) for TDT models. + durations: a list containing durations. + max_symbols_per_step: Optional int. The maximum number of symbols that can be added + to a sequence in a single time step; if set to None then there is + no limit. + preserve_alignments: Bool flag which preserves the history of alignments generated during + greedy decoding (sample / batched). When set to true, the Hypothesis will contain + the non-null value for `alignments` in it. Here, `alignments` is a List of List of + Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). + The length of the list corresponds to the Acoustic Length (T). + Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. + U is the number of target tokens for the current timestep Ti. + preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated + during greedy decoding (sample / batched). When set to true, the Hypothesis will contain + the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. + The length of the list corresponds to the Acoustic Length (T). + Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. + U is the number of target tokens for the current timestep Ti. + confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence scores. + name: The method name (str). + Supported values: + - 'max_prob' for using the maximum token probability as a confidence. + - 'entropy' for using normalized entropy of a log-likelihood vector. + entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. 
+ Supported values: + - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). + Note that for this entropy, the temperature should comply the following inequality: + 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. + Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/Tsallis_entropy + - 'renui' for the Rényi entropy. + Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy + temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the temperature equals one, scaling is not applied to 'max_prob', + and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. + Supported values: + - 'lin' for using the linear mapping. + - 'exp' for using exponential mapping with linear shift. 
+ """ + + def __init__( + self, + decoder_model: rnnt_abstract.AbstractRNNTDecoder, + joint_model: rnnt_abstract.AbstractRNNTJoint, + blank_index: int, + durations: List[int], + max_symbols_per_step: Optional[int] = None, + preserve_alignments: bool = False, + preserve_frame_confidence: bool = False, + confidence_method_cfg: Optional[DictConfig] = None, + ): + super().__init__( + decoder_model=decoder_model, + joint_model=joint_model, + blank_index=blank_index, + max_symbols_per_step=max_symbols_per_step, + preserve_alignments=preserve_alignments, + preserve_frame_confidence=preserve_frame_confidence, + confidence_method_cfg=confidence_method_cfg, + ) + self.durations = durations + + # Depending on availability of `blank_as_pad` support + # switch between more efficient batch decoding technique + if self.decoder.blank_as_pad: + self._greedy_decode = self._greedy_decode_blank_as_pad + else: + self._greedy_decode = self._greedy_decode_masked + + @typecheck() + def forward( + self, + encoder_output: torch.Tensor, + encoded_lengths: torch.Tensor, + partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + ): + """Returns a list of hypotheses given an input batch of the encoder hidden embedding. + Output token is generated auto-repressively. + Args: + encoder_output: A tensor of size (batch, features, timesteps). + encoded_lengths: list of int representing the length of each sequence + output sequence. + Returns: + packed list containing batch number of sentences (Hypotheses). 
+ """ + # Preserve decoder and joint training state + decoder_training_state = self.decoder.training + joint_training_state = self.joint.training + + with torch.inference_mode(): + # Apply optional preprocessing + encoder_output = encoder_output.transpose(1, 2) # (B, T, D) + logitlen = encoded_lengths + + self.decoder.eval() + self.joint.eval() + + with self.decoder.as_frozen(), self.joint.as_frozen(): + inseq = encoder_output # [B, T, D] + hypotheses = self._greedy_decode( + inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses + ) + + # Pack the hypotheses results + packed_result = pack_hypotheses(hypotheses, logitlen) + + self.decoder.train(decoder_training_state) + self.joint.train(joint_training_state) + + return (packed_result,) + + def _greedy_decode_blank_as_pad( + self, + x: torch.Tensor, + out_len: torch.Tensor, + device: torch.device, + partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + ): + if partial_hypotheses is not None: + raise NotImplementedError("`partial_hypotheses` support is not supported") + + with torch.inference_mode(): + # x: [B, T, D] + # out_len: [B] + # device: torch.device + + # Initialize list of Hypothesis + batchsize = x.shape[0] + hypotheses = [ + rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) + ] + + # Initialize Hidden state matrix (shared by entire batch) + hidden = None + + # If alignments need to be preserved, register a danling list to hold the values + if self.preserve_alignments: + # alignments is a 3-dimensional dangling list representing B x T x U + for hyp in hypotheses: + hyp.alignments = [[]] + + # If confidence scores need to be preserved, register a danling list to hold the values + if self.preserve_frame_confidence: + # frame_confidence is a 3-dimensional dangling list representing B x T x U + for hyp in hypotheses: + hyp.frame_confidence = [[]] + hyp.y_3best = [[]] + hyp.frame_confidence_3best = [[[]]] + hyp.logp = [[]] + 
+ # Last Label buffer + Last Label without blank buffer + # batch level equivalent of the last_label + last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) + + # Mask buffers + blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) + + # Get max sequence length + max_out_len = out_len.max() + + # skip means the number of frames the next decoding step should "jump" to. When skip == 1 + # it means the next decoding step will just use the next input frame. + skip = 1 + for time_idx in range(max_out_len): + if skip > 1: # if skip > 1 at the current step, we decrement it and skip the current frame. + skip -= 1 + continue + f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] + + # need_to_stay is a boolean indicates whether the next decoding step should remain in the same frame. + need_to_stay = True + symbols_added = 0 + + # Reset blank mask + blank_mask.mul_(False) + + # Update blank mask with time mask + # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) + # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len + blank_mask = time_idx >= out_len + + # Start inner loop + while need_to_stay and (self.max_symbols is None or symbols_added < self.max_symbols): + # Batch prediction and joint network steps + # If very first prediction step, submit SOS tag (blank) to pred_step. 
+ # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state + if time_idx == 0 and symbols_added == 0 and hidden is None: + g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) + else: + # Perform batch step prediction of decoder, getting new states and scores ("g") + g, hidden_prime = self._pred_step(last_label, hidden, batch_size=batchsize) + + # Batched joint step - Output = [B, V + 1 + num-big-blanks] + # Note: log_normalize must not be True here since the joiner output is contanetation of both token logits and duration logits, + # and they need to be normalized independently. + joined = self._joint_step(f, g, log_normalize=None) + logp = joined[:, 0, 0, : -len(self.durations)] + duration_logp = joined[:, 0, 0, -len(self.durations) :] + + if logp.dtype != torch.float32: + logp = logp.float() + duration_logp = duration_logp.float() + + # get the max for both token and duration predictions. + v, k = logp.max(1) + dv, dk = duration_logp.max(1) + + # here we set the skip value to be the minimum of all predicted durations, hense the "torch.min(dk)" call there. + # Please refer to Section 5.2 of our paper https://arxiv.org/pdf/2304.06795.pdf for explanation of this. + skip = self.durations[int(torch.min(dk))] + + # this is a special case: if all batches emit blanks, we require that skip be at least 1 + # so we don't loop forever at the current frame. 
+ if blank_mask.all(): + if skip == 0: + skip = 1 + + need_to_stay = skip == 0 + del g + + # Update blank mask with current predicted blanks + # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) + k_is_blank = k == self._blank_index + blank_mask.bitwise_or_(k_is_blank) + + del k_is_blank + del logp, duration_logp + + # If all samples predict / have predicted prior blanks, exit loop early + # This is equivalent to if single sample predicted k + if not blank_mask.all(): + # Collect batch indices where blanks occurred now/past + blank_indices = (blank_mask == 1).nonzero(as_tuple=False) + + # Recover prior state for all samples which predicted blank now/past + if hidden is not None: + hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) + + elif len(blank_indices) > 0 and hidden is None: + # Reset state if there were some blank and other non-blank predictions in batch + # Original state is filled with zeros so we just multiply + # LSTM has 2 states + hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) + + # Recover prior predicted label for all samples which predicted blank now/past + k[blank_indices] = last_label[blank_indices, 0] + + # Update new label and hidden state for next iteration + last_label = k.clone().view(-1, 1) + hidden = hidden_prime + + # Update predicted labels, accounting for time mask + # If blank was predicted even once, now or in the past, + # Force the current predicted label to also be blank + # This ensures that blanks propogate across all timesteps + # once they have occured (normally stopping condition of sample level loop). 
+ for kidx, ki in enumerate(k): + if blank_mask[kidx] == 0: + hypotheses[kidx].y_sequence.append(ki) + hypotheses[kidx].timestep.append(time_idx) + hypotheses[kidx].score += float(v[kidx]) + + symbols_added += 1 + + # Remove trailing empty list of alignments at T_{am-len} x Uj + if self.preserve_alignments: + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].alignments[-1]) == 0: + del hypotheses[batch_idx].alignments[-1] + + # Remove trailing empty list of confidence scores at T_{am-len} x Uj + if self.preserve_frame_confidence: + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: + del hypotheses[batch_idx].frame_confidence[-1] + del hypotheses[batch_idx].y_3best[-1] + del hypotheses[batch_idx].frame_confidence_3best[-1] + del hypotheses[batch_idx].logp[-1] + + # Preserve states + for batch_idx in range(batchsize): + hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) + + return hypotheses + + def _greedy_decode_masked( + self, + x: torch.Tensor, + out_len: torch.Tensor, + device: torch.device, + partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, + ): + raise NotImplementedError("masked greedy-batched decode is not supported for TDT models.") diff --git a/tests/collections/asr/decoding/test_rnnt_decoding.py b/tests/collections/asr/decoding/test_rnnt_decoding.py index 9dd955c24a70..ac90e62036e0 100644 --- a/tests/collections/asr/decoding/test_rnnt_decoding.py +++ b/tests/collections/asr/decoding/test_rnnt_decoding.py @@ -130,7 +130,7 @@ def test_constructor(self): @pytest.mark.unit def test_constructor_subword(self, tmp_tokenizer): - cfg = RNNTBPEDecodingConfig() + cfg = RNNTDecodingConfig() vocab = tmp_tokenizer.vocab decoder = get_rnnt_decoder(vocab_size=len(vocab)) joint = get_rnnt_joint(vocab_size=len(vocab)) diff --git a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py index 
7764649bf1fa..3fbfcf6df54b 100644 --- a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py +++ b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py @@ -18,9 +18,13 @@ import pytest import torch -from nemo.collections.asr.losses.rnnt import MultiblankRNNTLossPytorch, RNNTLossPytorch +from nemo.collections.asr.losses.rnnt import MultiblankRNNTLossPytorch, RNNTLossPytorch, TDTLossPytorch from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_numpy import RNNTLoss as RNNTLoss_Numpy -from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import MultiblankRNNTLossNumba, RNNTLossNumba +from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import ( + MultiblankRNNTLossNumba, + RNNTLossNumba, + TDTLossNumba, +) from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ @@ -494,5 +498,68 @@ def test_case_randomized_act_label(self, device): assert np.allclose(pt_grads, ag_grads, rtol=1e-2), "multi-blank gradient mismatch." +class TestTDTLoss: + @pytest.mark.unit + @pytest.mark.parametrize('device', DEVICES) + def test_case_randomized_act_label(self, device): + if device == 'cuda': + numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + + B, T, U, V = 4, 8, 4, 8 # here V is number of non blank labels + durations = [0, 1, 2, 3, 4, 5] + sigma = 0.05 + + acts = torch.rand([B, T, U, V + 1 + len(durations)]) + labels = [[random.randrange(0, V) for i in range(U - 1)] for j in range(B)] + + fn_pt = TDTLossNumba(blank=V, reduction='sum', durations=durations, sigma=sigma) + pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) + + fn_ag = TDTLossPytorch( + blank=V, reduction='sum', durations=durations, sigma=sigma + ) # ag for automatic gradient computation + ag_cost, ag_grads = wrap_and_call(fn_ag, acts, labels, device) + + assert np.allclose(pt_cost, ag_cost, rtol=1e-6), "tdt costs mismatch." + assert np.allclose(pt_grads, ag_grads, rtol=1e-2), "td gradient mismatch." 
+ + @pytest.mark.unit + @pytest.mark.parametrize('device', DEVICES) + def test_case_fixed_case_act_label(self, device): + if device == 'cuda': + numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + + B, T, U, V = 1, 3, 2, 3 # here V is number of non blank labels + durations = [0, 1, 2] + sigma = 0.05 + + acts = torch.zeros([B, T, U, V + 1 + len(durations)]) + labels = [[(i + j) % (V - 1) for i in range(U - 1)] for j in range(B)] + + fn_pt = TDTLossNumba(blank=V, reduction='sum', durations=durations, sigma=sigma) + pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) + + expected_cost = 4.155739 + expected_grads = [ + [ + [ + [-0.64962804, 0.25, 0.25, 0.14962798, 0.2672583, -0.16792619, -0.09933221], + [0.01651875, 0.01651875, 0.01651875, -0.04955626, 0.022025, -0.01227201, -0.009753], + ], + [ + [-0.04892651, 0.01714851, 0.01714851, 0.01462949, -0.01143234, -0.01143234, 0.02286467], + [0.12531489, 0.12531489, 0.12531489, -0.37594467, 0.16708651, 0.13027048, -0.29735702], + ], + [ + [-0.02572276, 0.00857425, 0.00857425, 0.00857425, -0.02286468, 0.01143234, 0.01143234], + [0.13388914, 0.13388914, 0.13388914, -0.40166742, 0.17851885, -0.35703772, 0.17851885], + ], + ] + ] + + assert np.allclose(pt_cost, expected_cost, rtol=1e-6), "tdt costs mismatch." + assert np.allclose(pt_grads, expected_grads, rtol=1e-2), "td gradient mismatch." 
+ + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py b/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py index acab5963fa72..230b6b7c099f 100644 --- a/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py +++ b/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py @@ -17,6 +17,7 @@ import torch from numba import cuda +from nemo.collections.asr.losses.rnnt_pytorch import MultiblankRNNTLossPytorch, TDTLossPytorch from nemo.collections.asr.parts.numba.rnnt_loss import rnnt_numpy from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import certify_inputs from nemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils import gpu_rnnt_kernel, reduce @@ -504,3 +505,189 @@ def test_compute_grads_kernel_clamp(self): assert np.abs(diff).mean() <= 1e-5 assert np.square(diff).mean() <= 1e-10 + + +class TestTDTCUDAKernels: + @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") + @pytest.mark.unit + def test_compute_alphas_kernel(self): + numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + + random = np.random.RandomState(0) + original_shape = [1, 15, 11, 3] + durations = [0, 1, 2] + B, T, U, V = original_shape + Vd = len(durations) + + duration_act_shape = [B, T, U, Vd] + sigma = 0.05 + + # for passing into the kernel function -- it expected unnormalized logits + x = random.randn(*original_shape) + # for passing into the pytorch function -- it expected normalized logits + normalized_x = log_softmax(x, axis=-1) - 0.05 + + xd = random.randn(*duration_act_shape) + # duration logits are normalized before passing into the loss computation. 
+ xd = log_softmax(xd, axis=-1) + + labels = np.array([[1, 1, 1, 1, 0, 0, 1, 0, 0, 1]]) # [1, 10] + blank_idx = V - 1 + + pytorch_tdt_loss = TDTLossPytorch(blank_idx, durations, sigma=sigma) + + # Pytorch kernel + device = torch.device('cuda') + if hasattr(cuda, 'external_stream'): + stream = cuda.external_stream(torch.cuda.current_stream(device).cuda_stream) + else: + stream = cuda.default_stream() + + x = torch.tensor(x, device=device, dtype=torch.float32) + normalized_x = torch.tensor(normalized_x, device=device, dtype=torch.float32) + xd = torch.tensor(xd, device=device, dtype=torch.float32) + labels = torch.tensor(labels, device=device, dtype=torch.long) + durations = torch.tensor(durations, device=device, dtype=torch.long) + + # Allocate workspace memory + denom = torch.zeros(B * T * U, device=device, dtype=x.dtype) + alphas = torch.zeros(B * T * U, device=device, dtype=x.dtype) + llForward = torch.zeros(B, device=device, dtype=x.dtype) + input_lengths = torch.tensor([T], dtype=torch.long, device=device) + label_lengths = torch.tensor([U - 1], dtype=torch.long, device=device) + + ground_log_likelihood, ground_alphas = pytorch_tdt_loss.compute_forward_prob( + normalized_x, xd, labels, input_lengths, label_lengths + ) + + # certify input data + certify_inputs(x, labels, input_lengths, label_lengths) + + # flatten activation tensor (for pointer based indexing) + x = x.view([-1]) + xd = xd.view([-1]) + + # call kernel + # log softmax reduction + reduce.reduce_max(x, denom, rows=V, cols=B * T * U, minus=False, stream=stream) + reduce.reduce_exp(x, denom, rows=V, cols=B * T * U, minus=True, stream=stream) + + # alpha kernel + gpu_rnnt_kernel.compute_tdt_alphas_kernel[B, U, stream, 0]( + x, + xd, + denom, + sigma, + alphas, + llForward, + input_lengths, + label_lengths, + labels, + B, + T, + U, + V, + blank_idx, + durations, + Vd, + ) + + # sync kernel + stream.synchronize() + + # reshape alphas + alphas = alphas.view([B, T, U]) + diff = torch.norm(ground_alphas - 
alphas) + ll_diff = torch.norm(ground_log_likelihood - llForward) + + assert diff <= 1e-3 + assert ll_diff <= 1e-3 + + +class TestMultiblankRNNTCUDAKernels: + @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") + @pytest.mark.unit + def test_compute_alphas_kernel(self): + numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + + random = np.random.RandomState(0) + original_shape = [1, 15, 11, 6] + big_blank_durations = [2, 3, 4] + B, T, U, V = original_shape + num_big_blanks = len(big_blank_durations) + + sigma = 0.05 + + # for passing into the kernel function -- it expected unnormalized logits + x = random.randn(*original_shape) + # for passing into the pytorch function -- it expected normalized logits + normalized_x = log_softmax(x, axis=-1) - sigma + + labels = np.array([[1, 1, 1, 1, 0, 0, 1, 0, 0, 1]]) # [1, 10] + blank_idx = V - 1 + + pytorch_multiblank_loss = MultiblankRNNTLossPytorch(blank_idx, big_blank_durations, sigma=sigma) + + # Pytorch kernel + device = torch.device('cuda') + if hasattr(cuda, 'external_stream'): + stream = cuda.external_stream(torch.cuda.current_stream(device).cuda_stream) + else: + stream = cuda.default_stream() + + x = torch.tensor(x, device=device, dtype=torch.float32) + normalized_x = torch.tensor(normalized_x, device=device, dtype=torch.float32) + labels = torch.tensor(labels, device=device, dtype=torch.long) + big_blank_durations = torch.tensor(big_blank_durations, device=device, dtype=torch.long) + + # Allocate workspace memory + denom = torch.zeros(B * T * U, device=device, dtype=x.dtype) + alphas = torch.zeros(B * T * U, device=device, dtype=x.dtype) + llForward = torch.zeros(B, device=device, dtype=x.dtype) + input_lengths = torch.tensor([T], dtype=torch.long, device=device) + label_lengths = torch.tensor([U - 1], dtype=torch.long, device=device) + + ground_log_likelihood, ground_alphas = pytorch_multiblank_loss.compute_forward_prob( + 
normalized_x, labels, input_lengths, label_lengths + ) + + # certify input data + certify_inputs(x, labels, input_lengths, label_lengths) + + # flatten activation tensor (for pointer based indexing) + x = x.view([-1]) + + # call kernel + # log softmax reduction + reduce.reduce_max(x, denom, rows=V, cols=B * T * U, minus=False, stream=stream) + reduce.reduce_exp(x, denom, rows=V, cols=B * T * U, minus=True, stream=stream) + + # alpha kernel + gpu_rnnt_kernel.compute_multiblank_alphas_kernel[B, U, stream, 0]( + x, + denom, + sigma, + alphas, + llForward, + input_lengths, + label_lengths, + labels, + B, + T, + U, + V, + blank_idx, + big_blank_durations, + num_big_blanks, + ) + + # sync kernel + stream.synchronize() + + # reshape alphas + alphas = alphas.view([B, T, U]) + diff = torch.norm(ground_alphas - alphas) + ll_diff = torch.norm(ground_log_likelihood - llForward) + + assert diff <= 1e-3 + assert ll_diff <= 1e-3 diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index 5b30489f846c..68f1e38f797b 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -363,6 +363,50 @@ def test_multiblank_rnnt_greedy_decoding(self, greedy_class): with torch.no_grad(): _ = greedy(encoder_output=enc_out, encoded_lengths=enc_len) + @pytest.mark.skipif( + not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + ) + @pytest.mark.unit + @pytest.mark.parametrize( + "greedy_class", [greedy_decode.GreedyMultiblankRNNTInfer, greedy_decode.GreedyBatchedMultiblankRNNTInfer], + ) + def test_multiblank_rnnt_greedy_decoding(self, greedy_class): + token_list = [" ", "a", "b", "c"] + vocab_size = len(token_list) + big_blank_durations = [2, 4] + + encoder_output_size = 4 + decoder_output_size = 4 + joint_output_shape = 4 + + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + jointnet_cfg = { 
+ 'encoder_hidden': encoder_output_size, + 'pred_hidden': decoder_output_size, + 'joint_hidden': joint_output_shape, + 'activation': 'relu', + } + + decoder = RNNTDecoder(prednet_cfg, vocab_size) + joint_net = RNNTJoint( + jointnet_cfg, vocab_size, vocabulary=token_list, num_extra_outputs=len(big_blank_durations) + ) + + greedy = greedy_class( + decoder, + joint_net, + blank_index=len(token_list), + big_blank_durations=big_blank_durations, + max_symbols_per_step=5, + ) + + # (B, D, T) + enc_out = torch.randn(1, encoder_output_size, 30) + enc_len = torch.tensor([30], dtype=torch.int32) + + with torch.no_grad(): + _ = greedy(encoder_output=enc_out, encoded_lengths=enc_len) + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', ) From ef740068e75bf55aac14c1432707fc4ef136bb04 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:53:33 -0600 Subject: [PATCH 008/123] Fix get_parameters when using main params optimizer (#6764) (#6787) * fix get param * change name --------- Signed-off-by: ericharper Co-authored-by: Eric Harper --- .../models/language_modeling/megatron_base_model.py | 12 +++++++----- nemo/core/optim/optimizer_with_main_params.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 1237491fa39c..2aaedbe5a806 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -240,14 +240,16 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by ) return after - def _get_parameters(self): + def get_parameters_with_grad(self): """ - private method to load all the trainable parameters from optimizer param groups + Get all 
parameters with grad from optimizer param groups """ params = [] for param_group in self._optimizer_param_groups: for param in param_group['params']: - if param.requires_grad: # (@adithyare) adapter training with pp>1 can result in params with no grads + if ( + param.grad is not None + ): # (@adithyare) adapter training with pp>1 can result in params with no grads params.append(param) return params @@ -272,9 +274,9 @@ def configure_gradient_clipping(self, *args, **kwargs): else: if self.megatron_amp_o2: # grep fp32 master parameters for gradient clipping - parameters = self._optimizer.get_parameters() + parameters = self._optimizer.get_parameters_with_grad() else: - parameters = self._get_parameters() + parameters = self.get_parameters_with_grad() grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val) self.log('grad_norm', grad_norm, rank_zero_only=True, batch_size=1) diff --git a/nemo/core/optim/optimizer_with_main_params.py b/nemo/core/optim/optimizer_with_main_params.py index c9790ee2a139..44d54a0e63ff 100644 --- a/nemo/core/optim/optimizer_with_main_params.py +++ b/nemo/core/optim/optimizer_with_main_params.py @@ -488,11 +488,11 @@ def async_master_grads_allreudce(self): def fp32_grad_accumulation(self): return self._fp32_grad_accum - def get_parameters(self): + def get_parameters_with_grad(self): params = [] for param_group in self.optimizer.param_groups: for param in param_group['params']: - if param.requires_grad: # (@adithyare) added to enable pp>1 training for adapters + if param.grad is not None: # (@adithyare) added to enable pp>1 training for adapters params.append(param) return params From 19a8d2f63a14d7a98d176c4357f6ad60e049d2ac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:54:01 -0600 Subject: [PATCH 009/123] Lddl bert (#6761) (#6790) * initial POC for LDDL Bert * Finish LDDL POC * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * address comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix merge head * resolving merge * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for val/test loaders * change to new LDDL class + add winding * fix logging level * fix winding * test fix * fixes to winding * add file system * add prepemption optimizations * more logging * more prints * better logging * asfsf * add barrier * removing prints * working with mb lddl loader * final changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update requirements file with LDDL * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert adding to requirements --------- Signed-off-by: wdykas Co-authored-by: wdykas <73254672+wdykas@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../conf/megatron_bert_config.yaml | 2 +- .../megatron_bert_pretraining.py | 5 +- .../language_modeling/megatron_bert_model.py | 129 +++++++++++++++++- 3 files changed, 126 insertions(+), 10 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index cbc0562e2904..a7e3364d41b4 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -133,7 +133,7 @@ model: seq_length: ${model.encoder_seq_length} skip_warmup: True num_workers: 0 - dataloader_type: single # cyclic + dataloader_type: single # cyclic, LDDL reset_position_ids: False # Reset position ids after end-of-document token reset_attention_mask: False # Reset attention mask after end-of-document token eod_mask_loss: False # Mask loss 
for the end of document tokens diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index e6abee295a1a..5f0b74db92b6 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -29,11 +29,12 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="megatron_bert_config") def main(cfg) -> None: + if cfg.model.data.dataloader_type != "LDDL": + mp.set_start_method("spawn", force=True) + logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 64430a669269..cac1a50e98ae 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -40,6 +40,7 @@ from nemo.core.neural_types import ChannelType, MaskType, NeuralType from nemo.utils import AppState, logging + try: from apex.transformer.pipeline_parallel.utils import get_num_microbatches @@ -49,6 +50,14 @@ HAVE_APEX = False +try: + import logging + from lddl.torch_mp import get_bert_pretrain_data_loader + + HAVE_LDDL = True +except (ImportError, ModuleNotFoundError): + HAVE_LDDL = False + try: from megatron.core import parallel_state from megatron.core.pipeline_parallel.schedules import get_forward_backward_func @@ -300,7 +309,12 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + if self.cfg.data.dataloader_type == "LDDL": + # this is of type bert dataset + seq_length = 
dataloader_iter.iterator.loaders.get_seqlen() + tensor_shape = [seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + else: + tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] # run forward and backwards passes for an entire global batch # we do this inside training_step to support pipeline parallelism @@ -324,7 +338,10 @@ def training_step(self, dataloader_iter, batch_idx): loss_tensor = torch.vstack(loss_tensors_list) loss_mean = loss_tensor.mean(axis=0) else: - loss_mean = torch.tensor([0.0, 0.0]).cuda() + if self.cfg.bert_binary_head == True: + loss_mean = torch.tensor([0.0, 0.0, 0.0]).cuda() + else: + loss_mean = torch.tensor([0.0, 0.0]).cuda() # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -404,7 +421,12 @@ def allreduce_first_last_embeddings(self): torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) def validation_step(self, dataloader_iter, batch_idx): - tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + + if self.cfg.data.dataloader_type == "LDDL": + seq_length = dataloader_iter.iterator.get_seqlen() + tensor_shape = [seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + else: + tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] fwd_bwd_function = get_forward_backward_func() @@ -476,6 +498,95 @@ def loss_func(self, loss_mask, sentence_order, output_tensor): # [lm_loss]) # return loss, {'lm loss': averaged_losses[0]} + def build_LDDL_data(self, cfg): + if not HAVE_LDDL: + raise ImportError( + "LDDL was not found. Please see the LDDL README for installation instructions: https://github.com/NVIDIA/LDDL#installation." 
+ ) + logging.info(f'Starting building LDDL Dataloaders') + self._train_ds = None + self._validation_ds = None + self._test_ds = None + data_parallel_size = parallel_state.get_data_parallel_world_size() + num_micro_batches = self.cfg.global_batch_size // (self.cfg.micro_batch_size * data_parallel_size) + global_batch_size_on_this_data_parallel_rank = num_micro_batches * self.cfg.micro_batch_size + samples_consumed_dploader = self.compute_consumed_samples(0) // data_parallel_size + # We run under the assumption that the datapath is the prefix if LDDL dataloader + train_lddl_data_path = self.cfg.data.data_prefix[0] + self._train_dl = get_bert_pretrain_data_loader( + train_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + samples_seen=samples_consumed_dploader, + micro_batch_size=self.cfg.micro_batch_size, + ) + logging.info(f'Completed build train LDDL Dataloader') + if len(self.cfg.data.data_prefix) > 1: + val_lddl_data_path = self.cfg.data.data_prefix[1] + self._validation_dl = get_bert_pretrain_data_loader( + val_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + 
log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + micro_batch_size=self.cfg.micro_batch_size, + ) + if len(self.cfg.data.data_prefix) > 2: + test_lddl_data_path = self.cfg.data.data_prefix[2] + self._test_dl = get_bert_pretrain_data_loader( + test_lddl_data_path, + dp_rank=parallel_state.get_data_parallel_rank(), + local_rank=self.local_rank, + shuffle_buffer_size=16384, + shuffle_buffer_warmup_factor=16, + vocab_file=self.cfg.tokenizer.vocab_file, + data_loader_kwargs={ + 'batch_size': global_batch_size_on_this_data_parallel_rank, + 'num_workers': self.cfg.data.num_workers, + 'prefetch_factor': 2, + }, + mlm_probability=0.15, + base_seed=self.cfg.seed, + log_level=logging.CRITICAL, + log_dir="/tmp/log", + return_raw_samples=False, + start_epoch=0, + sequence_length_alignment=8, + ignore_index=-1, + micro_batch_size=self.cfg.micro_batch_size, + ) + logging.info(f'Finished building LDDL Dataloaders') + def build_train_valid_test_datasets(self): logging.info('Building Bert datasets.') if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float): @@ -581,10 +692,14 @@ def setup(self, stage=None): else: # TODO: consider adding a ModelPT guard to check if model is being restored. 
# allowing restored models to optionally setup datasets - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - self.setup_test_data(self.cfg.data) + if self.cfg.data.dataloader_type == "LDDL": + self.build_LDDL_data(self.cfg.data) + torch.distributed.barrier() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: From a7403c26f79b4914a7735f64f9da60124342d9c7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:55:29 -0600 Subject: [PATCH 010/123] Fix check (#6798) (#6800) Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian --- .../nlp/data/language_modeling/megatron/gpt_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py index cf1de245d0e7..d7113e7cdde3 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py @@ -601,7 +601,7 @@ def _build_index_mappings( last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.' num_samples_per_epoch = (tokens_per_epoch - add_extra_token) // seq_length - assert last_epoch_num_samples < ( + assert last_epoch_num_samples <= ( num_samples_per_epoch + 1 ), 'last epoch number of samples exceeded max value.' 
# If we have less than 80% of the samples for the last epoch, From d9843331f7b0e0df536637b187a0d6b87181bd2c Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Sat, 3 Jun 2023 01:56:31 +0200 Subject: [PATCH 011/123] Fix validation with drop_last=False (#6704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż Co-authored-by: Eric Harper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8ee470d70a7f..fd1382e668cf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -455,7 +455,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready losses_reduced_per_micro_batch = fwd_bwd_function( - forward_step_func=self.get_forward_output_and_loss_func(), + forward_step_func=self.get_forward_output_and_loss_func(forward_only), data_iterator=self._make_data_iterator_list(dataloader_iter), model=self.model, num_microbatches=get_num_microbatches(), From 8f26d838074b2a1b8d4d85fb3fc7c63453a2be80 Mon Sep 17 00:00:00 2001 From: George <37293288+Jorjeous@users.noreply.github.com> Date: Sat, 3 Jun 2023 09:19:03 +0400 Subject: [PATCH 012/123] SDE unt lvl comparison (#6669) Added a visual utterance-level comparison of two ASR models Signed-off-by: George Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/tools/comparison_tool.rst | 31 +- docs/source/tools/images/scr_10.png | Bin 0 -> 42125 bytes docs/source/tools/images/scr_11.png | Bin 0 -> 50431 bytes tools/speech_data_explorer/data_explorer.py | 547 ++++++++++++++++++-- 4 files changed, 537 
insertions(+), 41 deletions(-) create mode 100644 docs/source/tools/images/scr_10.png create mode 100644 docs/source/tools/images/scr_11.png diff --git a/docs/source/tools/comparison_tool.rst b/docs/source/tools/comparison_tool.rst index 1e28621704a6..6e5d28a0feb3 100644 --- a/docs/source/tools/comparison_tool.rst +++ b/docs/source/tools/comparison_tool.rst @@ -1,7 +1,7 @@ Comparison tool for ASR Models ============================== -The Comparison Tool (CT) allows to compare predictions of different ASR models at word accuracy level. +The Comparison Tool (CT) allows to compare predictions of different ASR models at word accuracy and utterance level. +--------------------------------------------------------------------------------------------------------------------------+ | **Comparison tool features:** | @@ -12,6 +12,10 @@ The Comparison Tool (CT) allows to compare predictions of different ASR models a +--------------------------------------------------------------------------------------------------------------------------+ | visual comparison of predictions of different models | +--------------------------------------------------------------------------------------------------------------------------+ +| visual comparison of utterances by their WER/CER | ++--------------------------------------------------------------------------------------------------------------------------+ +| listening selected utterance | ++--------------------------------------------------------------------------------------------------------------------------+ Getting Started --------------- @@ -151,3 +155,28 @@ In this case, all points lying above the diagonal have higher accuracy with the Points marked with circles should be explored first. Words in the first quarter were well recognized by both models, and conversely, words in the third quarter were poorly recognized by both models. + +To compare models at utterance level, pick it at top dropdown field. 
+ +At the next field you could choose metric: WER or CER + + .. image:: images/scr_10.png + :align: center + :width: 800px + :alt: Switch mode + +When an utterance level is selected, it is possible to click on a point on the graph, and the corresponding utterance will be automatically selected. + +If audio files are available, there will be an option to listen to the audio recording and view its waveform. + + .. image:: images/scr_11.png + :align: center + :width: 800px + :alt: Audio player + +In this mode, filtering is still available as well. + +**Limitations** + +To ensure efficient processing and avoid issues with memory limitations and slow performance, it is recommended to keep the manifests within the limits of 320 hours or around 170,000 utterances. +Exceeding these limits may result in both memory constraints and slower processing. \ No newline at end of file diff --git a/docs/source/tools/images/scr_10.png b/docs/source/tools/images/scr_10.png new file mode 100644 index 0000000000000000000000000000000000000000..71c378efe57bbb800029b40a3c764d366ce6a930 GIT binary patch literal 42125 zcmcG#2UOEf*Di{Jq9P(9C>^OHO?q#N^rE659ciI=0tr=6QF`x1q=`U6Zy}%}H9&v} zp*LwU5G0|8asvPNJ?E_Ne&0R!u6sY$VoiQCzs&5}Gy9o6&))g?R9}Po8q+m0GBRo{ zO||D_WaJ;o$S%@dxlF40T4eH*R9x_RuJMQrGRT4Fq1)q&9T`~>rq-dFdEPEs zap$7rl-1PjMd``6uU^r&OX{)fwf)jdd#{!?8oQfsYWA_>?WY}v4{lla?>sZH`D|qV zwCGySlZyLQcj-Mo@Dfj30ZQV4L0R!tIUivdw3KZ(M|QRUpr2Gr03lb!^BKMI{_l6F zNFtJvbtQy!C{dn$y6#P!Xg~Wvc9Z+!Sut|u)`hdOt?TM7G_bF~7`738Q$9^wV~L0@ zvYi%e_J|BNxfHAe(CRiNa+(#`e|+Ax2bMD+v?q(`eMhEtExBQzV``yieMmH1m`nQd z<{6=hp0g78gTrTKJ=UA}Yhm|gosII@@YdEk5}NCn9G*R)4GGmrr;+tKDW~W>kUQIV z1|jJLpHf&pB6qgBPv5s_mF_4ggVu0W#~J0?+|vfEkaX%hn00T4)*FJi73Dpk_Cif8 z?kydmR%XAaHwTj-DY+SXtDzW9P+cX@@l4L#ueu_FCgJspa)|U^pzQP}$Z04IrIhTx zw<#!FT4~lIEV=0isjW=XnOS}ag(ID!*ZT*HF$WZ(0(`kyH0fH#dn~f0eM0qT6H7|; z)Uj!-E9TASa4S>@+0v%dtVoFNC%oA6t63iN4f7q0NM7>Fb!XG*8c1}JBoqDKSMR1* z^HaZFlR?(Um;9bQd$4(~x3bk&uK}?iN_uA8D5GYKo?DAfS6%(oH$FIv8$jcz%_Pn# 
z8&Zo9HOP|ollSTO+I7{M32x4r8x|^ql!-lx5IdEY?vvy{9$4`1RS=0E(@O8n$DZ!W zXU^5uQvJcWCNAH}-pd6Ir0b|pN?)f=W8ZX49D@x6)*?Mi6erWX(>vOB{VH!~u9j$^ zHPSDa7hn_~PwFbwH>`OBPS{BE4C_kqOc>{6Ma_`e{p{7_{g!zT@FzNPwKv6U&<YCD_nST*fS=FpxJK?ir3{klkh^ZC{)YW6f(^qv`WkLm*@V&CCsT=NcH6~rYQ^E<$BQDtO zmR6Q%yG-dhR-NTE2x9s3y2|~S&z@i{v>Jg?I8az2;1X}F|a(A7cDinjq)cJ=+_c9uG8 zSZCWEQ2EEC%(++gt5Y@4OMHCFvGdzbJ^Ar|B#dw>!0iJHxOu~3K$&2-X9M zyx^7b8>1sfpQ(fV`e^-8W|T;~F683S?^A%==)q zSqAd6uS99+aGsUl{%4tRzY3L8)*~OQewaB?Q5HHUn;+>7;QXH~vPR&8=wTKH1+!Jx?aTqJwi? zn?r|wG0Y(D>%*m+UOVUo~Y&V+gtVy(P%7c zDe1dXn&nv@NaYDUU>OCA%jpqfa;d#o7UYJHJq%OxNK(KTATEy^h+gk--Eh zBCHIjRTET=X7x`LOb0m}K05f$Ix{lt-CWBysXLo1I~6j!WZ;7E^FAgW4)SNKayk5M z#7_oH2CiW#R;Ia^LPmT^H&_&}kSR8pDX3V^poJa9d|K_tpg$H_rnPk&MFr%eF?1sS z`RlKQ5N?PxR911N%L?JZDS9Hv375AqXh2>+fnn&9pu2v6LMB9;>V2!X$S+Oj`m-mV zZrfz;S?yLYsFF??h9Hj8nw>0mr8V?ZvAN$SivleJo@O}Z3SwK#AzR^61HUn`pERfW6>CezB0BKGjN4vt3cHQQ7m5_AB~W zYwM6n8)N#(sO&8J&^=+N`eZ7M*`#n;x!X5t()X=Z%0LNUgx*?Mq74fHTP%qybUA6= z8tmq|d2M|_Y1Ror+m=;Les z+-SBJ+jb{#IVc$&S1(Y62u{l{K~SWZW^?n!Qe1j<{zxP&g@tEnVtPXU5mf+Oy4`N^L`UnZwN1+uZAMbC*XUG}I zNr7h6r+6fV4Xlv0e2BXqWK6X=@&4N-FlZd=y8{pryy0UP1GHk5eF@oupst4?2OkVr zcCQteEArU3a*u*g*G8k9nlKBTE_$YD#Mww2is>%DR3S)(&VQ^CLA zX5KyEC`C^QrBqvx7c(!bn4KMKWTQhFXl|*Pc_H3u4fDpBo;La?+is#~EuRAF{sn0N zF8^DAjT$4pP8bjV`<=4JZAxmQ77)+9FMc-u!~4{XVrp?Ri&tbZMafIw$Z6^f1(F>T zn(Pq_vg=hHOHIGyY7NicB=Z9Aj2bWJ`p+hc+IEfer)t%~${ILsah&Z$6>o`^k#Pax zkC#9Cik`WpVN0Eifuf#gyNrb4|2a5YSzP`IJwC$XJN$OvDR3ztVNVm#>*e6BNo45# z(`rQ9m!~;|v);qBhm!t5|Ivqw6={XozrbRaN)q?=}$@f3EtppQ1v2;OLA-(t+o8z z$NUF}NUD0<%Wa$e##U1YG9#4*EkUzEsLJBA8C?E6)81z09ykvoKG^e9{0d9^S$MjX zZ+9@4FJOfOYv&zemDAfn&ILgSwLW$ShNs-2;Fzn8?k9PgFwiDHA;7wB+Wn$V_hRG> zM*6gjCAdGoPZ>izT)WjgLxpceW?-ng&H4OgRI@JUimPq!+Tsf|IH~Jup-Rvm5d+gCt^e3FB`mt&Q9!y zpFnSHsxK@~*1Ie>Ot~Wh*6N&{zPa3L7o>t4$pB}Jy zgKC8AR!d$>oRU|a!Y*7s3VRLQ@5p$J?capOr6Geu$`yhL9h^eEjuPs*L6GU@hXF#| zq#?uJ2#$A3jPKOl6nR(y4Ey4Imzzx}RL-_;&To4m86^~=k?&LB>USA3{BR@Ld*QCf zJQE=|`%a%@JHlcAlbF7-!EqbtG*Bc?cqqwnWg_pSRorucF_a5&8sgGI<jC 
zN>KSr#9VO7S$-wxUEuca7glJ;w1W;J9Bvj1-@LAhcG|@TIn#1{y?6XRm`#2| z+AR9rxiNj`9vm;f0M$B!EHdtQ9JFWUJeTJA^ZviDV`>pU?>&A$4mfKJEBeV8$iu|V z=Jk^n|EC-}X?$MWvyU#5?@YP(!coHl5vW|1V=Ig{z(S8~>k+^j|fffi7r0F96@Zehz6WNhkB-`3=Yi zBp5qK4M#y==`&0s`}gGjtDgQf_y0q4+qptvG)ns_4Q^AsfxGL86rtx6x?D_BSr4rS z%RRna-z1iR`8Pmk`&-nv8eJziBUynNEdJA8z_Y798HYZ!?eJ?WVNhPoq5rOYvDinT z^eBVc#d8H{#&wx91$Ocbb0i9voV}gUDq|~h8FGG4{;%dxZjsQ0i^P{@=U0pT632HK z1Y4ZgrUyGI_z+5$BC)=QrAvkvi!l_07=Tu1xP`_B`jgN&`vOrj<214Mgsw z_ZQXK465bim3FF9m{2&jE3apcvgB;{s=)tV?6e{O zS?_}Gb8DPQ+Zu8jeU9RIHrVvEtwJbr(m4QsA29MI`3YOd5#vP5GQxX-aj-}eC;1U! zA4fv+hvz3>nA}o52UsZTT~6@s#9&AAv7A#q^({BV!Uk1i-@DbOjj%H~=y}{YmDXv) zA`}WHJfi()nV$l}Ep#I+45+p)b~sah_}ZoVdCGhL)nD)!A4YpJ{L6(!ExufCO16Mcxu%(F5$F3_#rE|))7cwg|1T~4|CwF>N3&%f@fcs- zn(~Z4+F6X5VUi#`FO(1dsK3@S5yAnS4oI@P{$qe zLb%nrp+7tLUH>7<$9@UsV($egoo;OVVPpR9&1)h{ZY+wqsD^rp9nOY!Xol9@p< zxw}Lq6EA`?LS$Cn*r8D`k#6bn#P)prGOk5~|u zq~H1yw6{@N1pg~nXQ`CgDs@lkw(Q>?FFZiJU69Vv;oMfDUY=(CJXY_yv>HbTf*VWW z#;$tw=0zZrFrfsY<^`B>&(iDQniZUVO>e8uDjMGZ*@M@o=j{{;(|L$wcbyBzB zyr7M#6@pvngVK#DZsp?%x*83@IWT_F)VmCk{d*J6w)Hr14qG1IS1PPvrcT5DAd&^& zzGMiFPcdK>#D7Qa_4sd2aBElvy$cH5okKM8x}=Xd ziYPa!xPu&_L4Rsa1?Emj_5B8u=T~hq`tnB!`vtAUv8fus-f2%4Um8N_v`U3-)<3KN zNI5@+d;k_WJ7n}E@$n6%r%D0k#%yy56G;I*_Z~m#_m|$B6<|M_4LYer5Bwru3X9MU zYj&wT;$058aVFu@=HpeSUC1z22M2oSO>4Eo3ws!yWjo36Sq0ttSU7vYvvxL8R`}x zCMj`;pjMv9>ANv1(K5>ZC&Qv~~vtRDpshPv%r*?$; z6izqUq_3N{KR3cD=<{Z7IwBVRbCDG(#_`$QrP&j%O$2oi5Y0fC2_=-4QTvA&hm65= z#FDElBz zS6uAagHXdZ z*@L@`$}?_Pe?bbO;Yi;QLv~I2X%!WWQ#3lZ0YORI8rSJh>H;3 z+46>>8XTQfcI$GM`{xvdCxxTkjDJ}OG{8`jtQVMZ=7O+o;^FM}vzqd!$k87AziFqN zOMS|6>fpR;@yVTv?U+_g-uK+>XA76~A{$o{G9+m!jKA2t1;{jRma3St4zFi+3j-Aj zO4Cm0^QFnDm+0Ogs(+N=m?{>EkP5SK4e^L%u3!~E_?#f&mQb31wTPHu(35#7ym`&O z$Na>0Owr|$QF0Lgyp$ns^GfP$ocqrn$GHEgdjP)RuqyrAh<5jh!qN~mpEV2>)zw*R zRr7Y0HtW` znbG>OJtLnyt5ezeS1c;%qgC?xjX~4W9L>$k^ba$`L^Y##?)LoKhX14Y^7R%sE|^FS z!VdV`ZEmi=o!dzDpCt~pJq+241I-1kY`_M{32U(1@wR3rrTfRG6qdv$7T`40 
z;#{RR^93o#)18rH^)I~n8Z|Gj%gGm={n)ru^c#Pg04f40`yIkVY1BqSP%UR`B33h@LnpuMzI#ES|-5z^= zSdom)wkksU20dPOjdwziu@tU33be%O5aBjZb9L*eCjRrl0IhdX$ zaR-6VI)E(%BJ3mem4k{{WP#AuddG->t(O^()uj;kx;I3K`9rQsbAs%$)4JUWPd@YRG!L>1xmg}%}Qq{P|Y z(p2$QU3?bGQF?Nm>b(sxNGKeb+X_{FlA~UAOp_eX0&2SMrF47a+vE4W-y_-XKUNa! z4+_a(scG+FRut*k3?Y7Ae|7k!mAFD#v)4Sci{e8Jr{@WGsOJdig%T;fzvO0NE`hM1 z8Ca%BOXqO2D|Q~6#~sI~ap`_4{|+JCEu$4V6aQEK*X!nV>Bls3D!Zpg`F&;L zC-FIl8&Jf4Z^wpw9{Ey2N2EGMm^)x>F1vdZxPH@(rT_2j)X67VBD=(k_aWbp-XG+~ z`yEy0fU|--CQmQpy8Qw4IyV zj{Rx#`{29}jd@_BB~*R_S1THM^1R{1J9*t~ifY6r#Z9r_!GVyBYGjVanP4UT}`W3H-c-6M|E$zHA%=qd%s51B+kZ?`-eG>9MEVwuj`!^!O)oo%JvAL3Q#Urt}Kx>A(EK zq$mG5`}%K}?Y|&r{}0V`XA!;l(f8EM{~}#Ry5iYA&fTu96hr-L=!3llT1xEP)@7*!0)DTwU+F2_v!yWrWf%YCEfdSZ?i$f z9P8eqp0+Mw&>vVDa^$<`g#zylPJ;9ODrvemgmRtdHhbwrn?e+L#f+mln zDl!G&cg8W|42WwUa-8dFJhdg!(+evT1Z42z!J%H>yRMt?ANpoI$!SQx2%ndcAl-9i zU*|PY=H@2=Uj(t6qH8Xj`65SpV*bE# zwOK)vMyX1t#(sPpY3ph$`1A1J;?9rrI?YZCU`Y|SQR7{QhF1fxT+N!NDZg7E4Q4k! zNa$|tsk7Z+_WN*7oZY>xu9ciI|1>{$2b4I)py}+I9CLS+dsQgt1D5)?=Q>3ux6g97 ziG{jGj)bX^h!t1xt{xr#j=X2=!cy?=w%mYixEh(` z?ZKhtJz|G*pir2*i#51K*|%BzN0xt=Sb&j?qQu_Y7=hK~BBy4pwEd3IK)}PyMat9M7=Sy>QutqfSv}5v5*Q0~!3(HH=O)fiV>ksuD=FQ5hHfy&&Ah1M z70h=m&9?VzqahRJUuB>zAXYgRuXm$o>b`}HDd29x=K7US57W1WIL9=J-Y^L6uF%F{ zHL~8bxkn-QM0l6%;ia4^5}1();F9FcTkYUrm6Nd@Xpnz(9NAwOD~f#7M$J z;to*9!g;iyRyg`t{sSp6uhNk9aI1_;?sHx+(*rx1-3|EY zA2urMr(X|+IInGVmm(D3%=toiB#2vY9weHn={{6#3uY3Qj^BB^WPJupv5u?9-q=SQ z7wxT9hiKQwmVkjH*M_oQ)|kTnI->KwS|n#U&E)#b(C&OzyWXwz)hS#s=;uaB3yv3} zMRBw7!7S(J`H<-%`$w8+?OJxIB0LV-kKOwk5dEQzc&Fz_D<5|U;T7t+516_l)I@#f zYzACgEsN$&+PsQ6Y@$N_jB`U+3*JeH{K^80tA6?fIQBiTvbo9_(q`L2T==T#k2y#Z z9Gv43u#wrNwaO46NWnm!E-0I_KRl^qm)|^?=KQ>N9I#t?yY;)#v^z0Y;J{2RAl+03 zT_w-etp2gbHaYFP``MAu?cDL6+Pjaxblf;KaN*q@n_R0ACLwr~f<6f)D=t{^ZdWXF zk=?~m>XCk##WcA0w=5T+=FnX4wBP^(p*1GLKCwqLHh7FUmT$At- z_)0b5H;q>E)sPts_+$8thKn2=OQ_4P)Pu$;WwFsuWgB>XP3@M8!bDL$*t}u>9PB?U zVm!ky=0q^xGu2i`ekRCdq)!fvDSCayKJZj@;Mj~VaAH#qX$^kKXnKyO7h;Et=^BTa 
zp9Q6&e)DXuHG-QpdNd(e_pJK@rQ=w!@<|lm9TGPo5mnpv3}y7QZ%r?|d@kw=nP0I3 zaqGKyi?3*5KJO3qH-1ey()Qs+4ItZ#ODkUcnh-E7bsOLk`wyz=-nEeP8sJ-|+4a`& zo)O+j=_J_!iSFR|zBZy|CZg<)%?7J=cn117i zj1cKGg%9~R9?i!oWJKO@e~3MK6&YagB6r2F*#^oELx`|F@h2GEZOj+m> z&=-tH(AAxl-!;Q>tuP@saGK1_` zW6&je6_B|q$r2fMCV6HlsAYWv8Y*^uv#a`W zPe{d$>sJ;UnJDm|T(fqwbtmDzU=U-ouDd|Y9t<;<@vVm856*7Hf+=^8ltgwq<7=8@ zbh}m`(x-3m>fqssQ+(N!Su7r!vB7B`1j@`~y9-uMv1qtREYMu>|Ir}dDEUxPF}Fd* zats4Z_c(dI+>yD`DA$4*K1hH;e8cW;o`hhK*MXg>&FfB?_Oq_cgL6zQM5eoEdxRqJ zvT@|kUCP^@NZjtsCN<2B?Hp-```(3EdX${f8G1=j4oo%lDZrg-<0?u!dNwj4nm~FmcI;G0oB`;1_Hj>6Tid4X!4jr?ko&CXcID1(VCH zM_sP~4YjCPqvk4Zc?1MQf4tl%*vImmMhVlUt`@s)N)9g>3#?0JG{-K{SI^|GmVU?P zp;e|VKK#Wm$o*!He$wxa`4|WC^M*xqq2{~WE~}7`n_9%vL1O9pHi+)4Y>NuxQEdmV zLlLo=X+7eR&+GhXHgW9{Apig^T8Gh7Eve=(*<9I`V~c7pIWX4Gyw4>fIUlD1X6B#$ zHjPtgUhZfs=nrX5i%7bDojPQ_+=t?$?3m8VeW$}i-$Nl=vLRu)?=Es*)u80o#EPIyJ>X*0>pWn*VQMr#c|VenYuVd#r~rD})^#@=9um0- zbvQ|Fuu$Wd7KuO<%GMmC8x&7ozZ3oO0sLRz;NAL(CC!F zZ)k|lzy_ULBkyKWTG=N{8RM*6VL*rdWNKmY@ekii3o%r>os-xS!%p&$!AlBo{?X)E!q*M+;^&I;sU7q302V()X|aQA;r zpb=un^+I>6-P5ONt(tzAD0w8%Z-@!b;>A4mN-it!N`5{MmzGmAOZD)V=Ga)h#J#&? 
zdH;0?xPg1<)C@-Fm{3Z4;;*(cvEmr}%h=o+W|nbH3;ebM;~kG;w=|=j5l@-4=YSLR z4E(cOd}z3fiyM_va#u<{lFAz-GpYR`*?dy%5eM0WMBam~ipc)Aoh!8~93H?=8!KT8 zEB;P>`p}YP{Y0&ji+?KcSAFbbQSW$KgLFfdm`q+G3&K0kP<2TR;zq@e-NW zatAy-0{iz}<=`zj8)BlR3iP&5-TWoRtnuUg>G@CuU6}o<*O*d;skRQ3Eq^sCsB!=J z>8(d7^THzm^#C@shUNQ@f^_6wu=2#Dk?dzQRjsxzf8jAsav6zZ1JiOlef-L)T8Q?H z(UBbA+|Lc#TF|lY;pkri?U1`k-TKo75gxKc=0+v%Q@cN&Ry3AoP62)|h&4C$2>V;V zH_H3252dNGPV`_4E4WgG{vtWviV27WrARc=iv}(UZcHx5e%)!!8}L>)WSR~yOa&Dt zJu$VFN)Z?Ui>fYauU`LaG?C0ZSW1#UF!&k=}# ze&e+nSr~xb&-KEFRtofJImV$hHSe=VWqC@8WI}`We7k3AS#%P(lSDEQouF%B!xziu;N~dVR(`%-^Y4Wyi#KH zwTt?b){WV=3Ip)1Mty-a>cmi0znt`EmluKPp3my7lDF6`z*RlT-IEp#c^b{E8LEr= z@C1*#G3ROi&PA*Q{!7L6vIF((o;h?rasC}%dv2ftCW?=69ENOFC zo|6wR>Gr$-Zab$>{D;e^dODk?Hq#Shk>Rq=`;DE<8LBp(6h)gKbn)5uFBTeLhMIuj zrTb1`#q`V)^2rLIy1~eBVqIY5l)}1iGRk2;sikzSv+TQ>>rYiGQ}9xWM6UbDt3ukE zkwV{0)`1jm^`2<6fP*jewN{DmvIneYs-yWM^i^Kk98kqTjJCh{*^4_7GO?@_{|IycK_!HN#6FSJl4&Igg^Q0~SgBuY|DNyAp+0dtq_)

?FkxDtA2MG7#8Pl8aRv@)gE{(COd|$ z*bXy0xB?&t{VD^ivhkH~Z+F#hybya}Q%QJ5lR`p_eE+qhu)_y*of`&pT0WPstv4HB zoK8<#`V+4d49j-K_{ro3Wg6P&W3wTD4Q%!I1gw(ryn;@6L!{!Qd)d ze1L;u_h+^yYA@j?Vu^ao-Fbdh0FM#EP^4lYME(}QGjAK7G>ycn%IwPt*(%KjIw*bn zbgOyeKDE+%U5wj+OTBlah$411XSD6iT{F+Xd7!<-ovz@RTWjN<5o2PGRk|4Gw(-Rt z$H@|U2$GvNcNze~G+Ztzr5wR5HS2|1D>F1G3uWP4;GoNcLnUT>BL}35QF0m)<<#>E zIu+=K4N>u&J;6@|!%dWD1LZ<3-VbI^g90}}?b-aV7`X6V*=#a#U*<)5o91Xv^vAd7 z>-v!zsin7m*37thqY57P1;T0j^y0ae{9|K+uCNGnG)i(+=&Dt;SBWiDC>!$TAm@DB zVKKCoR|>G=bw~Cd{@;T}E)4T7O@}JGn*Hq?yLGCV+ioHyzNw$BzX$g{vAV4Z0a=Cay)x^rzR81OJ^HcB2xk2nSbj(Ex0(rNTKEY~9>y%yZf=p!B)S;B zcZ~=$BS$w20@b_xN~0@|>W#?yzbm*p?uzY}IyvMGQn$M01ys^H!H4iEsk7lW6Szf|C}p3(zaBG%AUdy# ze)zKCi_9L=(0A@8OS;p}E0u)NR;kcjZ1%`>@WtD*6|{;QMwpWU5amTw5Vwm+W+8rRIb)raqVtx|{{$JG%R`uNMIF{TZP z1=aDvu^&0AuW7?N-wSLrkU4ZO<_ZbyUDuq08h_Nyv_{!n>=h_QcXn2*91l&!P?*YI z1o&cYG(As&oZyYe;G5yo!CmfvAMErdD73!Z}SNEHjLyQ%y*h? zpo}}m=L=DWnAp2gtvSbK2CJA=tH~A(-Q^#%E9!(z5xsXK2;XAnrp%jDn zPSutWmX;bRe6&CjiuZF)|VN?^)yPdB7JXF(0I;)%M zsR;^HbGW*f(z%vz9oKBFY)RWUJixuW0DxH076+(=J;-o3b6V#rv@tb!9KH&qN8EO- z9F8)o~HP?WD`~<7_O}*y8A_E zLHKw=1+50Yjz^~O5%m?A>|x}17gDZJRXS`YlguGuWAeeduRi=u^|KW7R4s+}-4`?0l#u8v9)+IBq#}C%#{u-MJmCabqmXcuxXv$>G*5#ir&F}+ zPd~O;Jp{OFT{_IP;`I9KDg7}w^^2U56g14ca!w^Ib1j~AF}+`YQZ^%2{HpP?P8K}n z^*N4>rA!v-)wsdtq`mv>qOJi>je>oPF<)RD0OqTMhGwB5(|O8H1>pzI+cWJd3=G%8 zAFnCN#WcKK>s3{BP@A3PU6fMFBp%m&!3?%7=_j`LJC1(1Z(#q?2}Ur>d~lPrfB|u- zqBCMTku5hoP18LPa_>7|oY+h-^dAlkeVektTocjQ2i_4Bz!r-SC0VxDQF;E9{G#Rj z*VanQ*<8bUnq$~9dp?gxGv0L-Sa8KM`=*zhCH<2bo5DIXHOD)b30|cFab@Ouz2(nCtSNTd4DF|e9bc1P!WTB3l@6b_H6m{d+PXB`)AKC zA|Ql1zUb|y0<=O)yDk2A+`MSfD~DO|Ms#kY&%i5L3$6hYddO5o4x*ZshJtv5U6{oO zkP#oI_9>J$2N6c=oF(#aR)2i+%{EwAE>oy>^{>9(nSTt!w3Z#cD%+v)xxTw%ffuT+ zSQmzyUWp^qaV+a@l6Ljbzx zWr!6(hZ35htQtc6;X>}}XyaCV_%SAvy~o%s5mu`eVu#lg5Suk|5kXG-Nem-(l;d+j zmmr~ArIdtMuxm?m!wm8gUJu>Gaj!&bH8Op%$KnI}Ef%HnSY4WFTFl7u^$?|}Z6o%B zf77{4`8)VER;;R~!3TAUG6UC$L9|ZSAMn6&hk=?|x}CF%k5?-@65P6ffyD5W3&BcF 
zY}e8wK8=i+v1@DljE7Y8^&xF7prX^epRUIm32WWiSlkW*2V$UhnC0caK4JRT#X5kc_C?kVf_C5EP*sl{Gyi(JW_}r z{B3Pn@;mlJi({h1K*S!y-IiNQAuY*eIBFc~&Ni&_Lv%FD#$ZKVRU#qgFKvpxzepC4 zP41YlMqfk?iN>2Bg?^LV0RicvuoI{Ydf@V2JvS?Oo> zzt>nhp`??cWWTV*{z{A*L#vc}gtk8`px*mk%+oa7pA_@_qv<0p>rg z^M^*VK*-cbrqt^<6d9i`iZy-xK@NL;;{ zy7%{)Q|KA*{-@EpDgG~(dfi%d(zIa$Ml(6D5cnBAxaTfC`~pQk@(WhduPTR+nYTCV z2`}7#nVL81#O|hnMj6SD(oj$sCLilXZCUnV7=`#|B^v%rg$1&4R5 zDhZ_E>6JnqKFBc1sc%)aA?blpcPK$LJgHF?!^i*#7f{s3d@JGdE@lI1E)}7cTT3jP z{hFty>gs6wuEoz6158?<9X+t6Bo5;;Iy^zZ^_f&8l>Uu9Z~8-EWQP1hyR zl1KBej^3=^kij7C=03D}Ax8RXZFE+Qu;Q2NG0)s7UnHynoluWlH2i%I&88rrtl{s^ zl(^Ifecr}?TDrdATB(ew!D% zB+*?HKSFK~;<7PH>s~%}e}9Yc>A~_NfsX}+7NM$3C)2}CgtC{iW7qyfQE6``Dcx^? zJ)FQ!o6OPK)yL_6XG}|T z&{v+8ypEm9P3%BwBTtn>fL0S2>bU(cV(LGIrK||U3N(zdb)OaDG5}@T6PI;#Jamb( z*IuS{OUmq~rp#vasT7l}+uzra>~6~TJP@o7{osqJ3Kp@;QofnOX1$|ZOfVKnGRS`k z(JnV{oAaBc%3h(qwk(7wAu&$2z^C=A?0EvKPCQomXzWg%Y33H*%uIRRrE;IBNPDr8 z6LSHs&HPGmAHLlZlZV?o@?rzRohRK}lr7rG?Ik5F4nfJgJS8=`@~0F5SUKBZgLKB{ zcG9O`)0wHSyMf6sjDf_}bvzPhj{N>g9fx?@?M(nMQeP|ia9Hh2&{1v5a-OAfu2*@} z-k(%6#5qi@pQdS>;qU|E6)|$CqYZ?f?ds!$#_!)NzsCnt$8d<6J&tF)Vm)6&&O;wO zb@RB@A3^Ig)T(E8O}BW6k>kNSkpF7*k%ghwHD>3TAj?Rn_Lnr<2B$xrRQHGPbUj(Q z5M_^_YEJn9A*a@K{d5Z~&u3VzO#`x;bX)}Uhr{B;C@<<5_(v}Kk5$uxn2V}jFY7(L zD;3n*@nX!!-QmmO#3&+5AM6G1lNe_q2H>7I00r74@(?YGx+=BEGRBf z7t}~iinPt$!xzV6gVR^W0GI9|-AuO+)3tb2WOoEO_dM~!Y*}+BZh6IkHXm85-R(kB6aVE^S+T335k z9u%ZPuxhl`5>@vt)4#ZtXpQA6RJffDmD|rLLa3@bz&@C@iG4-SM)PC6V%s0eNbXld zFOYJXR5}o$Qwap#7e2mk#NJ-7&y)IIG^f-_Rz;!=01%0(v0JRYldCxxksxMn58u`4 zh2Ocoqx6o7ebYFJPcvDgQH&$G4S#;XvgHO3TP3o!{XN56O9hJceDwda{cz!dya@0& zq)&prNxt#XUqtxXteEpIqMjwi&f1aWKINW~)#}gt+#c8Aeg7x%8n7$R*r-z$HR6t{ z!IgX*30xM!7YaOSyWI-p=A(Hk0jOKJRB%Cid?P1k3_i8kCe=3^I^IBGhf?7>lG4PG zVhEA(P!X;+&v!j;RNVGfutuGzD|i3=z~Ivjjbs}e@d0S|KZPlfF0H;^FXJerisZn+8haFy=QiHV%TJhn#cXzYdOw10ly}w# z#d`jWl2w#7#MIY#ah%U)+wJl8WNq`vx-UJ8FbcvVv~j0I;EF71MoBgvsL%iKt-}8s z-wLM!=9g<-LX^%_)d`N*Q9O_+X!Ma2@X6S zrL99md4=z2#U=i)u(HPy!nL@K@{wZ7@ec+s{0-ADTMxK+lkfeIAlq8Ww)QvF5Iu0- 
z1)Wq<)THnDW-4&y(zx5Vjz~oIi+_&!^A98I!Ph=e14hU=x|bQnopG_8?r#zk)ODbo z@yQx&e9>Zwq_hkWxd?o6Kx}r3{za5nVg;#{X$VwN89E{DfObCtSNfmPkc+dNow64` zp?8m8@Cu#Utn9Sw`y&_4Iw`%`uv(D6W*r%P_dpeVk8)MN=LBctxJsjsP^6>)#q~6q z&dpPaf$58+gtuR1)`+AAvNT$biyt(Wdh&Q>Ui)HREiubV3&y*M_qjQp<@u}huUG9f z*u2=iQb_Lk3iQeSVT&&wzvKmv_tyC*w?aLn+ft9p9elQsp&?`Q7+Bz2sJA@4|e zyW6p$ol~Y3NzvSIpXg9e*FbAe4w|{*(+**~EmGO#;SBJVst1(ZQRj6BZuq0P03nxDIx4PSA;H~7pU~a;CH|OU;tDA-SBC>mm zeKJjqaAlUy?;FDd&$8s5URVoQiy({ zcH^CUc4Y&PQV@_?lz#1nA$$p_lA|#FwzWp zo@^mYs6yD4?93<|gQ0bV;DFB+M}x;BrChdIXtJl*sFm96c6D$kL(GocLHW0XvE&^T zaqv4$D4Zzq7rp+xe@nj<6y1+tqjiX_6KQPJe%N09m)3u-9#?V+7+Q+j*m>g+Iyo+3 zjWYW^zKq{2U`msI6SPMernRTuE8B%pI}P|9#BE(&w`aM~&lOy8^~}$YU?oBQugo?# zGyAU8VqddsIhO`nUgJT8HYS;U27hRapR^)(a`f6rEsoabsW5-;vl7JU8!{A?6xVeV@inCnJa6VPx{XEw{C@X>k1g z^%x@R-J97v6Vg9DUdW?m_%6d+%ha5@o35UKS-nOaO*Uc{#Z?G0mr0T4r+|h%NHsF5 z7FU89j18En{nRYv!2>$vHStpAn%!3zdQ?V(tE9A6$q?QPAW8;%B7aaMbiXE<*Qdd? zzi13Z9o5;s=We~`wP88I_JJqcz}sxD&JS9ZP%y-bEhZ!)@8BGqWx1ATjQV<9p0%6t z`?p!5?~&b~^OW_7vh$r1xxYR}Sv@p+%Tjt&t^iHE1nCuwX1(`67<=!yr2cq+yt1<0 zCrwQ&ElbUvduz+gg}L{rMBD=h;>^s3np+TOx%U=S99fwIm7J)En2H-Lab!4t$GV?; zfA{ye_xt_vC+ax}Z_ar?U(dDCktbsM6f;@asywoMJ&Kd{n$w?G`!MK9xzg;f4n3r0 z+FX7G_4FAP)eo@j&}MZ{GdkNGy?r7@4sy%q^T|iMVTT6>Z)9woZGgB2R37(ue;D?W z`K&wj@l@FM%UySQVub8%v%4+fbDU(el{43Ee8vx7yXVQ1KDjkZ6o{DkeJ%3pW0!jj zE_3Qi%?qYN`^#d3ElQ>ald6NtFTzREw zmTL)KjO%V^ecS~kx^)P4-Mghyx1$G#2UKX%y+k2uU)p#EIcQ%g46o?Zh$={#67v_W zKR3-fKncAw{}V#6pE$0rhbXVVQ*nRzZos{@(MehGz-Yq&t;ex(g6^3(ks7Sq+^a1S zR&gB94QrmWX{~UYW50Bu^(*bszwfQ_xYt4BoNsx%B_CZ@{6y_HaIB(ee@^sxd@czO zLc+;cI|(m4QaZ+R$nG%u&rDAKsOETEw{MFvT<-G{&&qp*r9WDpIanCwY-&P+hwN?> z#hUqIxKdjx-28-}yv+dt5+YnIrDTsPcs&0Sz5YwThcS6q?ZP(H_k|}Q(*hjq{nVA@ zq+pAStV4B}SoDp+S0vNQi&$`125+c&1bcaEF__y_;e@~+e<@dnRQHmQc^Qc_W9o|w z`c#ZavyEc-){*pYH*?l2Cb743_$2Oqxs@~Q`n|ZQQ2AVpCu3cAjW5&;n)5|@lQ6!H(~&#geZX%M`5HHUOTMSLY$Gt8l6y>sAnu2W`nmb>Sd$5$V;^Jpo>R_Z^@ zuygYI{c8Q#LmoF*efX72-#5|xsCf#Q6Rtd6!>ZoY-$?B&?Q^SO&(~nZt37p5zC@C~ 
z{tjWI4j>F=5IdL`{&!LQT>+*#0UH+eeyr~=`l>u(i|gnbl$L7z<7v5eAk6|KDTfT# z3v+J~@AMrA*?`A?`rWjB{ul0ph}J!_hQi=EMJmg7hl_=M zE?q0wN~~c+wjt!J`DPuYW`{x5!x8@kogDf&?06>>JnFg2|X?gb1t= zhvmAH2|Hz0-Y&2`bkn=7u4L2wJKX*bAFN8sqD{#H_{DXys@Q2_%Z>eSKhJJ|Jx{2o zj{@$}61wX4R$2?cEp|<7`}OOb`6iF$R@^(={ZtgW?yrAVI`ivk7MSw&EmCM!X~1`5 z_*xqyeO_kOHqprFOCqoyCSx8c#%8^xaLLKDEpqZq5A?hLig%R<(*8@;>oTPL zR}_Y7BmQ+j#U1FcfXDylgc9%X|7)ulzCc*W2ZAHK_zc=r{x5prw#<(IiE1;^<1lC0)G& zl}l{!J6b$vdT!hpJ$q%;+2{Qa1fP_@9?zMjs3`nMy7y}vmG7)))Jh06RBd(*x5-gXT zfz%wtXwy2qXXPXQC;it!gpkcf4ViYBJU3Ra0Ck&G9`q%PNs5=ecuyJAX;CeT1P$%< zu3OHE$?eWHms`@074lQ5!eA|oH8ZIfs%>Yf(walWza|!(!x%>8F4ibbeYy*^v;BVc z$2y~J^_VU5xwWb*Jy-Lt&Ssy#UDU*AmQVJJ!&$5DhWrTRtYGb&u?yH$mTi^tUNlg1 zn_Pt$ErPL zVK;0^FzN)ZCJ3I64eF}uD2e$fp@*9-?rLYo{@%7s+f_^twDM+hST!r4E4FSQ-J*kW zis(twO&pIm0ix;K7jH=M6L)jqcBA_3s_W{R`3D=|dcb?@9ZCUUi%%Ub)5k{K4szlQ zEFbDB6#A;v#1%8Y-LKguKvw`0@51%hk2-WQIHYAO-`4*8XUUy~oY{}p8ME&)#7Fhm zpPu?s)+|YxxR?A@8N!5}f8?Bvovvk>q`2K|Tg*QsChYS##yUzmwp#CVv@1e_WMfwk zLioybjmFO{#tc^nO;`^&Sem%pg(*L2tvDPjxBGJLx0=gnjdOgh4Hd1te6C1a=Y2IP zxWeRXdU{2vi$az-(ox~WK!$&U=z>0-un1(wWyLxp=p=~xVxViDa-$&I=eDm#S1gV& zcOKdPhN)EvevucLAVOf8?4;XA`V&*rWE*}dOYumpe!Ls<_M&_zJ0Ja;>5rSSMLyO* z43ZjA)R6nk6}v_q#rcLoCD~p*+Q&q%IN>W_wtV`K4y~Vf`|bvPX-Is)ZRJMTa)-~} ztzvB4TpJx;n)RAHnXExH8L!PdKe8B^1h>ja@II1!4#qwA{Gk)+A8@oeWm9uQkx(oP7Ua(pP+^%R-8#LtEGyS>OA`F27`2}w>5qmXB!w+ z97E|XgtYG(Z6{37mfq{UoHzI3evSC>a@}X?t-p7v${6VJxEAf|YUH^_di3Kf;-Z?u$2XIdxm@o) z+KIMMm>%D5cZ%1c_uX^2Vm09ljZv^$GFNF0GJ0oXGvVXJSEWd~9ikF7-+2jp;|%1; zL`hn1uOPkvbxOZ;@Q3xDk}w6*ZP%JLMjH6mz%(aR(x)-3*hX-7_8kjnkxsZZ-zS6R48%Q z&y*o~Ap`Oqe#Q6|i`^zOMmmjLykOR3DeEB_%B{U>S#9YeV|QypyIkJlbu&MO`#w_l z%m_rg$lE#9b$Kn1SNq6euc7s&&N980m0S%yt_i*}lAxC!dHC?ES+;<-BQYBxAtQJO7E+EG6KkerPtc)tRCeXEF|_;dZT#gVJQL%eU! 
z`?c%3opRDFS86LK&3$*Z40~>eRS=UGeQ2@*FJ-HJ+LubiAMMUVUplU5lWWQ1%Tp>a zdl1MqS8rcHSWztbrmT(mwB9`GB1|YBM}DZ3?0-(o;CbW1StD8(^J7=bHyKSDZqa!# zYi`f|8u`KBX|+y$F%iLk$v2Z@Me*|2E3AfzZUJ@kpFVv&xht=iixo(fkstc4OD}7C zUi(t9AYPSS4*?E4{-$uT>vWNxA3oj*BwR0AgKjSu`_+*E=cO(x22z(%+r}!7R`#1O z-vsRr;*~;&>m>xb)D!!XVdP-Fa|-Jxlw`7et6dF$LiSFbPA@t`ad(tLme+*Bl#5YP(AiI*w_f<^ob-}sBvMWoRI3L^# zT5Vn}?UH5Bb=lX&dIf&x@GZ6?~oG(ol6q|MP!ebBTyu$N#tx!{Kl-5TPgO?>j(^}FO&oMIYY@b1-+5ML6HoQOtg95@AE6K4!ISM$IiIDVdd&F4J z_E8Lez&=;uS?lbs&AMWb)%+&i9B$kFTkN49H}l{WRpT=*`$8RACVkbOS~J}{GZEy& z{X2~lj@ZkHrkqG>U7^s|yVhSDPn)CD7vV`s9T%Y84M<4a#y>it$EF{7_-&s+ zL`+rNBjVTE6C7OL>^+-l{{vADT!*&c0ev0cIIia7`q&ib)6?bF!O)>M4xD?fAWT+w zQf5{r#q8EI>{1c#X?YONwt`Ev_g_3e+u*eKSkc08fJ>QP0mr!%S{g)b&v7QdmS_Ttg z7E+SRNUQID*(VTa`HkY3OJ#CABXg2PZRf{*_+lQ?9)}PeAQFX(jI{iy<6yFAyW*m= zOF~qipY39p4-mCVi#qd}l2HCGVJTJU#!6Da;GmOVW+PKw?2b|hwq}FJ;_b!rPBz6cp}z@gAr zhMhS}O!uZ)@P4K$5^k{J3HGPkWzIAYt}>k@akDIXj5%kAH@#4IS43w>0)0nCN^|q* zqaQEtDQtYGOuRd<>YW74C?%U0;=20KZ1@VE4SU8wl^jXaW6TpaIGJRV71LOAQ8F{#(C)xAfDOC@v7r~U)nN!0yKDT zF*QzPIH-N6T0T`6oh6%6x|QDq;zkD*v=w_ko4?9CiE;e}H<@^w70p!JtCum-?=;vj zr1F>_cI^zl#__v=cr8nc7Yl!R_ro{e);I)HZ0MDg!eVgtmxDcYJ<+s4iN`}pTO@X7 zjFyc))*C2}-?*UN(osoL7sH^obZV~t#+eyqUSz?y$S2Xq*qxSX2sA=|(2C#C&m$hY zQndnIY?QHf$FKfax_FtgS@fAYWKq+Gg+(_@*Jr&ghVVZk%olg9`D;XFObOMg=1v4m z_yXv9x$$Jls_}y@;|sp;#a1yoEn37+JB-x>G?rE#tyn$RQPvxhT4L>941_$abqhl$ zygIr*yykW;sF!wB{_&BC(kn-!zP@XA{Om`J9Lb-R^JP*8AgNd(51Y093}mjwGpE^WZkk(=e%^#5`%A!SMr*avNS|!sdw?)|9jZ2t9>|;; z+_8;sg<2;0Z?=nNn7%hA;iK3We}J?`s_P75-%hUng_ZAe^31gqy1#-5kPFM=p~0QE z+mbV-TZ5fzGVCrIl7)@!ui}ynJsS7QrOJ-Zq-LycmM$g7z)y~RKUD}oC=YT=`l0@u zu&4cZvrQ&CdW%aLx;Wnft`m+s;SP4cQ8gum_QGX@)gE&RNHj;6^r}u8Y^2BaA^--M z43Y=ZvWRS>Wxp&pqivU*NgoKR3(&>G5JYaeRGjcEZy?(B%aHlp6nT2iBT(z>MYwB} z>2IA?>4uVz0l{NNPt|3a`z{F(3?7@2qnhW)!;3Zj?X^c*vP@YsKqdMv&#lP1$ zhOxiU)_rgBrZwx5*s~5_ZD*`hU2&~C&YFxwUAtE1n;jVE)*RsMcK+AcdTB}TjQ6-s zSY2Q1FXj1b{vmCd*OQ~VYpepM+mD(~hBSa z?ZA7=dFYSIk-GK3C}X@#Iud=;88-A({&JSe!r7x-(Z})Yx;X6JMhImBpLS{^AE$F? 
z3jOhy()F5WzL^4}XN6K}Q*|XzJej1GE)(r(?h_K}ZEwT9e+*?rbm3?Y4eew&^IE;j zD`~>b<}j16x5AO5n&Jr*{y3;=1?k}tl9luKesoe=#aw*90gq5qkRwdH|yZJFv^aIX77(I}0D;K`W&Ct&4NX6Z<6!GQ7`U>rM^z5D;c65tuE6!HK{Pc&&$9s6X>UF!$BvRrN_8#Z zpzX3?fH!S-cX!jg95xBpcy;covn{0tgg|BZ1VK19qN`>X8<#TMQW|_RcCo~XJjX5ASn1b2ll9Mbl{wL^Wcb7iw#ouJ`(3=S{QLb zK<=1D#JXz=LVrHu37&79o&791N*lMg?FBMu{sT=%chrltGU9l&R zqiv8EsgssEQ4h~S0%iV%Gn(A;&Cn_IK(g4YV6T4o9Qq&e(9%!4O*LhyOf6f4F8b4e zsHCoCu{obaSbkFe_wENwz{XXKb)%A$Tv!|IwV)S4wn>jEX~7C!05o^QU(FL>r1jza~t;7uNHuc(m2r7bFY= zf4B0aJ8+XjyLCc*HTR=GN{5Ch2G1qzS<_0Y)kQ;-3Or8a_G`jYu~w!VWJYmDT%Ate zntkBXt8dEP1w+!WyUfo0K^A>VZPI~*LaZ7A4B*tO8Jv$QFtl*W4VPe}>?CQ(EZ7ev zy=+tzApWaFP1*@Am?`8Y^_^@!$sonOoR`Oh9cz8!lXXPf(RuP1kL^74eF zZLg&Mk{^c&{{Y;1jm)?_==GU(%MWRDj!_BbEw5(T^5bZj;3Q@t3U-#owfMli&bH~M zQ%V$?n&g00xt-$5XX4zB6BrTZT9qs3LdwO3IBa*e>Rzm{OTE#p^)=+S(!Oiv>whb(_c7sdR=2n1Q6abv+p;`daJVBoNRiXZu54n=bvaGXAmE=_p1P z`H5~EFX?B0z^dF``Q-C#VNOi-U(Atb4q%RA=>Y@OnrLXVbFFmZzu71Mv3LJNpBylY z(>mV6*tB9+J>`DRrnzBLTVGmYVfb(~myD3)ot|8h(6Z8W++qP=h3TwHvGFCF7qw_ZXLUcbLUw zlv()~KxR|%E7(goQ+8#f)qKcawk4enaH$S@ayuISzOg^jq>?9ki5Xx$v83}z=>n2IuTaj8>4j|lO z6hE(lXm`(grIF5`clX-`{;A+4Po**_05at0>Eg)fTGl0QqV+VFJJzLEQa}3RPZ`gh z(Q(rmt=yzZ@a;A zTpOjYbCqi(qlEjW^YeNj(4A8?s6Lug>S1bXYHmY=`6kgrO~6y6Yn)9zz3lA8^0<3@ zZ!iW;MpOpzbf@N4(bNr;hqV)tJaeFy||er zFX312?}dLBmEWwDZa$)USN=V`(gH*&dfCfrOB3#?{MP8BnF`omYay z*cLkl&opFr>)OMGw0=*SPQ=LgSFWEEY1H-Y4&Lii?P7A=3pP%fy4!C1FtHq^jeI2% zXt&p1tHt&iAI>`uuwwd&-H#k13OyT6%$bMPD&W}7_ya|{Y~uw*tov_;5PKEMIrEZ} zwUQc1oUsu}IhEDfgNnUB8@^C)w)&&7nV`hYyeGu_!R|0nfK-diNSdg4o{9Tu;T)Z& z6p**#Hz6;cj9a?6ASb_=hPm$fNkU2teJ`9#w49a$eyuP)`f{7Ue|e69oP z$TwGki!MOHgrrgm^#~M#P zi4}%EV(QqO;`TlDvly?oPJ^GoVKv#2$}SXMPL1Gi$Pd8-Zrjj-OgM z*(gtFL*i&%$Im+TuDEOy{R}s99bi zjCILJQ2i%r5L`{mZDm4AooD!U{j8nyau=Y5kZIXGT&2m8^dJujwg@4r>qMmf4d2+o zL{TKenvmhv{XOdiOHtfV0r( z-rHys53)bd##AgI?*|kc`DqjK+yl}q=Td^zlNFd-G@)UltiZK0VT5ZI+{m^d`Ib@8 zm+s#{(sK+c5RWJn+W(a&5H*o#nU~B4ab`P)u1HLq?N1t8%I=PAIUzzm#$g)Tnl3Kh 
z%o+vVw0EM8>}u?~!zvoYGqeg9GW&*ja<`8P$otpgN^EQ8+wO|~f)^!2AS7gcte*{E z!K!*MW_S_N8)w4=5K}h0)I%%XUz+fS6-1rzWRu=BKJ9Kop4{fggVVgr9 zWh-gO(U{kuRHfJ~d+X(fdvt2%-S8OzK&TVKHv6DQdd3NVF4CBRTjL3ZB>kxG7r8?< zNTcZpq1FU@A+bJskW!T0AOmO^Te+4YmQd`H*QvavqLSmMO*;B)v8KH}y{)1#?y4Rk z4=w!m?BwaX*>>^DOo8%S<-$pR7o0S^WZFhlTD7Ul@te+9Uanhw_i10avI>{!qnU&g zdh>=|bUk^xvUG%yn9$p7g2dD*<$IRTU2-kpMP+w@s;@FcgynoK+iGf3*+=WfPEhaX zqs-5^twe+KpJq17w}iPA1Gf>Ue}>8899^q*`dLI$KmRrI+O*=>gfcG4i}>QnF|3;G zNL{7LWciz)@LEwmj7NBus(%P-Qv8EGyRWV~YEZM6mk-wK6Z)F@9dK z;>V6~KAS{j2uxq`njFJ>9)1RS^cFRnx_P{3f=68!|3QEbFf|*VzTICA+1$``TkZo? z2*4heT4S}^M2|UhRdJ}L8g&l{ULlh}hhv1Z@uQlK?Ro0L z5{p-B0)y}PQ*6j-=&nq@Au0Dg^C$60Ly|A_(6H3_qEs*lgQ&-AGz9&{Q}h|M1hZZRx^tTbyZ~ia3uy+Vh6(>7v;L*h{ zK^04f!hj(fG#$)Uty|i znW~*zdV6kb#-!T!{d5vK7SyQYsG6~gIp-7=3wzPtD}2@ydV~oQta099rWQ1pM0XA_ zHz3iV@}kutCCwx8?8Zf^DDiz>iwmb`ON2-3r)l_%#Oo62wf&O$VRoDEb@T^6FWqUR z;epA4oangLUqeK8>pyhpP#5<@JX7I8`+`#2!+g6^M+SGc7uQkRzhMm(-RyNc5vX;C z-IjmR&p%fA-|*vG=@fCBZ6^@UoH^w8i{E8 zq`cZI$Z#^>=0eS_;d~T(Bj}Qrj+}Mi&%qB?j6lZ*O;6G!M-0FIt{_nRbdiZ6^SkQ-og&`b=y*p8@@)@vIY>^p7qu8Useu%_ z(W`!9F46XN&uIR{>sFD?X1ncfGi1cbGlaq5W#GQ{Fi*6!&pPIXQ%sMVF62SO$u2^V zq$~FrF~5mMe|(Kxx~yY+CSYyCu75O;UbHFsTC7G*~3)@=I2w`jFL`0!Q%9U zP1?G(6p9gm5lU-#Ann3!BYv7RHyU>4uXRQ;TXUHTeV!D{ibvYdoQrK> zC%n-7FY{8U_8*|>-~3Ct2O1Ss_-A!bJ|AcNsW*tL{W;S?gd}T9KZ+KP7ou(RmHnSY zoc|Xv1>iN?&&+TC?YA5N3MMzCo!o(5IKRjhGa;$)io<8_07aPTW1C8ZF@lpyhC^eE zoI7Fy^BE)(PYeAtXWbGPoUE~B4hr~;EN~d{Q*`PkohO351`b4Q&M}JP9Iv_cc##zW z=}>kPP?RWZ;~8U7sbm0F`E`@Uv`x4D3UJ25k^`PZYai4PggTEDl7?pWHdcuZyIT&( z++Op9+!AkIPjDTpRu2da{bMv?mAF@{cCT=J)|dH)WsDz= z3T_DE!-T>~W8)g01&b#9r&e7PXN`|Q{pQrUPZW#KzUL~vw2<>Pd2}j3v@S?K_NMR~ z8=GT;#?ZZEgu9KlN%V_^02@#=DPTYtbUaCE?qmdEZ^3j40XWhin1_ zSiIdYX$%i)a*VrNtVfpO1gOHBv!j0DnaPL5xMZH0NTllBXWTX!*B*=0x#3Jd6Jn|0 z9|KAH9&gLH9-b~Sd_A;Pe684>Tc+8@NafMcAupi$Ml$G^C#RG{w6u(jrnh&^3Wgj4 znlekM6Smjxh(4m!b|>sa(Ji%UH))&e17|-fx8lhu#pZ9u{{>;aG}n#(gJ!)U?VOXT zJn6IC&wG8GEY0d_1APQ)U7uonYDZ6UlNLPO(8=KeivKunI|6DsS+#KHVfG5iXWTNX 
zZBr&Jq*@V9ppLH>D|IsIJl%hyG@72KSGn4*gF*VmFR!*tfKJ@Hz8|1ysWZ0fr}l>o zDu6A`!$ue9^eq}Rd5oZbEkwh$h59*Og42(wq}jb|*Nq`Z_Wx%+2`^ZF^}IX zfu0Iao%4PQUuvK)2UcNnp~xC>leSYjR|FI#g&wp?W8vN)ON@^-*Q5IjYbd

hkY_ zwfCVLyB~iKYFt^A8AyJ!Qf(T9^31!3Gu=r}sz)o$RK@MYVs(IlzJiU8>HERtQM-mt z5~w@jho9-N$qMI-8I&w2_xfvJNyaTXCUYwYP&3#@L-nlyL-*Q<8V1qTk>7fiAov<1F_RX<#AE;;4FXz+C=vENt< zd>dJqmIhYoNOSM*PC;+_Z=RO`E_=U#D_*Bp$5nyX+yR9d(*y5+0`+J~a^#>vkl>fn zmA|Zw?5)68w@X=wDjEi?q)*vwt70CnS<@#QA0f7tvX44F(d?B}dgGw<;izcuG&hTm z=agIJpS^uqqmRHvzpu>@;|iai61Vqk&K=I?soVuq<=+QRmzR|aa@@12+s(s*^m(Rk?NF(Drc#q zu8Y$u>AJE1KxcGKy?PnnxBqi)yUM7Nq*Y)dJE$ef8dILkOP6+Yu(1}N(26uSC8W;8 zq|Ldb*6gk%C;6k53~%L%%8d2m{@^eh{>S`J$7x1{a9)DTcu!NKE1!vQaY$b60WY*g zEq4ar!zlFPP*bz|%c=tYxkXr~!4H;@Q`8Ozyq7Tfi1z;%51)&h_;b zL!e7l)59R>2_4*UvrPSfAPs?+ED;;S*qQce=S;%o(i>Wxk)6s|+ae+JQj_=KU z;ucC*PM!9SvV>Oxy{0lU*-EQGLoy?BI)0{ix5&I zHMoGF6$I2|#jTs1sno`Psi4~Ct8%116!o}l^5@!7?+H(gc$urp(lIvxQwB*M&U1%# zix7Rbq;7!Rp>)Z3$qVJ_%9}z#lEvnI` z{e`vWQugIL?~|WSNAhCL`6eP$^t!e&kMhUZkH{VJ!!Fdb$tY#jTeND;8QZPoBqb`y zNLY#0*Ao^G_t$7S*X+$qAG^4zzkO2aa;g0Jvxin@Q|u9hxJhB(Tcg8eE#<7y{{*5Q zrE=JhYfO;nEZ;oA*$){D>zk_P2=K8h?V0b(XcA)0`&gTCDW;~UmKR)%%~>PIO*}0fqE`7#uKo}3M}QO|#2tXvTl2B@we)fj zvxe-`{7$)AK^{x`u>_d(F0N)wN?;)+kxHL)7YFW{*|M3dXrokxMU)Jx+gEulp9RPc zsC^b&gBB~nZVa!pds*KtdypKPisn5|naxs3TU?zz=q_b~yxFkhYoOe(s5`klkSOBj zE7*c-7!2+f0pw-oa_Yy{e{uoUzhl+KhKMeE>|PDxGwxYIRH~qo3zdAe(LIt#xCz4O zR#@hWe+{mh3-cCDqGhpzjo521KL!(iHj?r);PRrg`9~Iaeb<_PLn=SZi~58R>`MSv zX4e(gFu|^fDRI4%ll=DO@}1<9WDNpZ=Rx{=-x1jQd*M2TuSJDGLQXT}+z!j*y2kiG zw@OnDg0|3viM_C}?TzK3tXtVF$bUp%kNJ92s}gA6KY$-<)qic$()osX1IY>=QN`7I z&-d-Mw5T$K426=H$XHlKaI&C$tF?CdQdZSX7`I}x(@Muc?B#%-q0GHi<^oj5R+eO< zt@VJ~joZChS)ii6eDZ=*VMl-N!2caY+*CaF7i=l%%>t}5&dU76bKL-8wSO^_&5i$L~&R!D?p3mBi zK@wu7f(91*%;eNvk(`UYW86m;{g+;qlJPfORP$U7+aDDu@QPmz-redG$-A;NA5J#? 
z4!0_{Z(% zjLXK+%`7iBf--ICV$rU!|FX0+0sa!>Rrg|$^3Mm5AE~Cc!NU!D+2>58oQDbz6kaxI z#^l3*FZ30aTiU%vnlR(|3E*3<18jtm1~)vHRb(lcC01?EU*XqOI92J;^EH2_D>9$# zST{5(z(xCEkAXf2o8A|HJYjcsfw1udy5`?Pzm^9Ye_INTG^}#cMrPSBQS3%1{v_uA zkB%Eak){Lt)6Dvx_AzkWeLdgl%SiiCVx9NrJcAr%vF*yE7z12XSzJawN_+^Gc*1Z>FLF;x3 zwAyva&V#E_!SYdS2C$rxd_c$JI-Usa{Sh|~i{LMjK<@e98aVZkda*K*IQ-KX& z>+Yuak&c8?>+6<^4xOk?aQ&TQ3c1<)Z%i*Wvt|6)%vvdqgjlO^+J zpp0?(KyH&)O)s%Xf%O-)hC;%eC*%*`n<-*}U(F?>$GFaWkL) zDtkUaQHp>G{PeTGG`vaGH8p(sv$!w{{$2U0Lc_(LnmdX&x*vS6hN4oQ#uW~rv5L1? zU15lf)x0R37veQp_*QWd6Mxj3Zd_qC%O`UU(SZ??sGDd5JvK|z(vPHFXEo1L5a}Fu zdoh!rKpA>#L6xxOaYG-@HPH9P4TRv5K4_~eR>{mh<+F?ZxHA{kjJVl)T6^5gk;7?(K{mHT=NR-lLmEB)p6HRKJK zs~2m6F1&zWY3bwbelWBiO|AL3mh>TkTCUoClxY3xRCjqnisXf4B;RUOl+}6QtP1x$ z)7A)gi@*c>P5DaJn_8j*?YVNzVmX_f?S=Lvdm*9T%s1N_$@^z6;HclWB6VQ>i@hkw zTgDjgT6G~sR;zpFNbzI>pnFnP0LsOvB|77czvj|B#!Zi{09EK6y$RY6#|f0%`hg`X zfzW&2QLga*hYMkgkiM8zS5U3*z=txv8hmBNX)~fAMjAG+NpNpi9kE$Vv69f9@qLE z+BK}ZXFla(s0(e83I+hH{hl&HHOR7yH>?Pf4yKG`LqlS;GXAw@rg3i)R`N zjd4#$P;{VqT=*L_yNsn6;EdJ^L(#VQTql&tG*|b_}G|I~?I>(yY(I-1ywp_SjG>tN5gK=xj=8*@oTOkAf9uFty@|AL| z81x>Y`xjb`qbrp+(O#kZ7iCvFDW}V~BuDNarg*@7AM*#f!debit>?iTC%M@U4i+%= zPgSwm?hKesQn&u9>{PWZ!icw4O{X@0RP4-}Tvxq8k55DIw1Nrdn>Xg^#}GuJBnLg< zZLBBXs-I{)7di3zV%p>m-3JHy$#}j(5pi!&Q7>pXVeFwP-RhJ@37F4-*)M$iuxYHi zY(JLo<6=u$?@8`V^3LJ)!G!MoWfc1AE_h|gZ~~(wYE5Y@Hg|XtnXc+GvpKfa2OYob z--IUzH>8vMuP&Sh2|wMoK^S2IuNGv&)%K_96Ur)quY79PyXP*I<8Hn6RsmRK+Bt=A zNJa9`a`m?&_p=-Uah0fmctIn$r)yZAmEtZHFe0<1c#n9BHNc^_i6E)*RcB~T2_Rct zG?A36f@1>%A_DalTAr_Joicq2dbmd5ez>)cY77bZTn=n_eaeIBF8MuBcxZ+jCPnHE zr_W`=GRzHi89egDlqORJV7dei9xLmC4P8zZ{v^Y%J^%6%CQ(`v>j926;sMfm4PGy# zF(L_I1wtgi$J`Kz6`tV8QZcu{&S{8e?PUr+dq%|tHBk#8#$gCfh$S_N`+$_l!>}yK z?)ksA4h6mv&W+_;<;Ly`D~lqYx=*dZ*@aZGm;(=wwGNoKiV8vo8wyoMdku%Z0P_lo z<3>4%tGQ?*@}V-Ax9~9uH!zbV-DS3#lHpt0cU9?Av%4tP*C2LyUJe;jds@f{vu?kK zz?KgZ`j!;wb5;Fg%nAfa2h0Fi95%+gbk1w&wGha%VZc}0v_&!QLJuwoXMNVa^ipf# zBft5+jNRbi?~w1cY+nEBus-2cBhpno`+R0Ik9lRo=cBy>Qq@|&^ck?@a^;q{NR&wV 
zs2@giGHCny{pW3SVF?T~!V0esU6mI>ynl6W-pfTcsGgD_uj0Dtd(@cae&%dYr` zud-0xBKJArw$spyAdfGbY*4hcfCrP)$$SZKfcy{-;ui%t2BQCW-^(&87M@olDxy9O zIQ8x9rc=j5#WT*Mz`1WmUf5+OHJm+8mzS~*BjYilXVWq@n&7TSarwRBE97q!tpZ(w zw#nr2xu3gy4yg3%gyB>`mE{#kKRTFJ6_bNrzJofEmFa#ARh8SJEM5gJdLD_!rXo+G zu~uy+HFr>AcfAfi0c|n|#FZf73Oj}(pT?6*cdCJ)%z{+>;s!FFLtW&)=HjsO*`($L za`m!mcDnNz9mz+AYTQbhn+m8szuVjc&5pg(fKTfH6e>cd@nlWm^bUP`PkQFI5N8ah4x2M5P_|`{svXs)!M@JXaDVJ1eg+r<2AS!k46P#eVK_>vpJ{w%Z~A<+x5MP zGyBayom5Nb|B9+U{#%p9r}#JP;l1KC7fHJ6ugCqH|NpmS^sh_3c;P_fjqlFcx*Th; z)I{$+-9Ozue-n3)kHt;%ilqIC*pxpNk#V^XELmg<;WO{#R)me5M=PJ)`z(Iw(4A*F zd{saD|2BGElKx+6kW1X-e%Z-B!WUhntydcR(; z$MgKKnyer&pGK~A9MCTn^(l7kBy(Xz|Aq146(7b4vod~=iR0o_UST6_V+O8khUDk9<@53}AZilvDZ)IU@4GJ}?S5 zF&bFTT>fvSeI3UVrvA&furo2pqH-bnvl;u!7hCwyQ^k(6q)B~9CN^rw9F}pwKvMaR z73EBPMqMUF#9RxSb`mh@D@I;5nSNAB>%f+(`cLP+e1|{mb_=~B*DJ($-q_{nUm@J- z92t8)wz>Uyaym8qMw=_~Y2TJ|8SX*8DM0yk;!?n#3M=o(dwXneR&eq2wJi&;23e@3 zO--oCR}SmrGVPBmsk5o$lIE-8$uw6%b_AmLFLgn7w}S5r{!dG{tcbzkkwZ$DV5s7Y zILiOB0dAgmU^DNe(v{%$fkPKAQYeRlS@vblwEn)0K&km~?ne9l_uGn%i zAO0p#0|?nNzUcFJa5thjsLdgA^KpeosQDdcsIp*s%UPcSp{#VrY-(oX-Frq;7usDuZlhZPF*F*hD$qmQuQ=>L>QMs`=B*R7Q+hj#A=xk%pF zi&=O=S{rRmNkOYX5B=6VvwE=eV-DxdR8knkE$Nr{W0z5<7RJK;3Lx|uY(7`8WB~%OYGEimqKI4(Xx+00QB67?l_m(r*Lkxk9ox)U@r9WHt z9~^>cgJ`)r>7tP6`80SCQp6uT98GPC|RA zuQJBzaUj?`1vyjRy!1pxC?_N|A9b|OEtWe4CHWhG!0n_Laqd_LGhBmfYnmCw&iznC z1mNBQ!~?aBs2n5F8%}fHQQ7HJg~rccl@7Zyn|F}y88tdPCZEIK%$2kRXnhm8nlawY zCMwq?sJs{XvrU9mmPo(GiwypG{KXBEzK2Ds~5fSdjZ*)pw;Nw#dQ`w?NC zbc_1ADmo=X<(xJo5itf?5!`j`-EqqDAv0#*%tahr zt+4U*Ot@(Tgxe?BT;q&BuXravU-_wM^jYW>x(O7+WLxUN8*I^In3&X_Yj;6(b- zU;N2u`DQATpkWc0)2tqp@^X`R z8Ta^kmb<*;rSKBZhqqP!nd)5#q^ zv{@ioFsJX_)e-iskM7*M_!3yS`+v5cy&S)ZzV(U9vDuP8J-=UBJaYkx%xR^L9c4jQ z%O{o>9GVtfjwGxxq22yUcQa)CB&kN1dI@KIE#C?KLlHCwLE_0pz8%gfE4bO|Mxxs5 zj(Q+lP!*n=KQsawjL2El&br_L30v{d%KkRsXt3}4Abs0Ye{#X$N=5TFz)zz0W6h@5 z-2>(c|6v!!<=jYbQ*001_9*Zd^V!|5l!H!geb;8Y3XeQOXd_NtQCppH*E02s&{2DI z>b}Fh2;lrK+CfEf9fj3rHq30OtXE&68F_BT%wU7s%;>|(M{nqd(8JPvi{&Gbj@~oD 
zx(OoEMKSB>vN%D!Jk;1dy3Gp6%LKNt!>`<@g_k8SWu6VC{q~E$Y@5^#`2!lD|0ZBs z{Zq8YzO_Yf^8ByO)ZvPyIGW!1db9BGt@YKVmFrUnW+{#evK!=QQa)9qn!kTOFl(jt z4@vN^L8YWD|M~x~x$oa>bpOUF{4ZGl|6BMrxOF_k2oe#S{)r|cN3VTQBz*i`}Ke!)^|f3y1nJ{?85VX~Zu!aU{D*>0k$F36vCSm;$`%M41)1^^he%vk%x~Y})qys~ zxGb;n?gA`>z!%Lylj&QaOf=EIPUl$sdQ)J7(!)oz5AThZzsEQIumtX%bDab|))*;5 zU=(molG9jy@0oNp8L&Zw_sW6H-vnyA9NvBf$W)~6dmmOgH1{%TSiQPKwqb{@``UJf zm$*Cnbm4D?oAS9>AztVl@_OMGyU12`Di$Vr9JZl#Lu#$bbH zBg18L@VqH4$|dWVpb|6ZKwQNHk2mfUnHT;XY8=kVwkABf*2Z6KS-CE{SqcT-_{*#2 zUGh-0C_-3`gtaau0E{>_>vL~9r^i{WI&XoY(N*aaroWB9TQW17d47vN#wbYtT|4O{ z))Y*;9)uqfy)x$1z|iAJ|0eEO!$Gotf`-Ke-cCZ!g^Gv*44tIjz}_py7$-N;xC z`!;sYTIy|k?>{}M;?R(qCvjaHS`ScE7Ff#GjbQ0d%fh(4!IAemg~&kx5t!MNx-Xp=_j~~b!>WeP z7CYU$)_vAQMf3POmI(t;tE5isk$stEvu{DGswAmVM0p#!$RKCVV+mFXsPN~C(n9tsQ1perhLo&8FKj z!nHwscr6bfW?dmMSJ66xyRfYh33?3PKe+0QJeLJRTp|aV^=;ozva;6#-eOy`O?*3! zh61V{VOuqqetz0;-**5NpwxqCdOfU=+}(TQKUS!W!JeyjckeaGo6xvdaG`P;9$H?k z`$6%XtRgShMpoWd4c)4+H*immU0Er;mAg z`-0-gA2&ds$n2XPaO5;?SFqc$C-cC~zaL%)8!rQ=R@CzV<5{X3y^Ifkg!=6Xq4jBRw?b=trl})RL8sj|6$l;+Gg!>c5fexBDBPPeqZ#xL36G< z^CWADe=Vy#LOH^5TG5%3W5K0MSQ0!W-flZg3|JNj1} z5S2;ZM9qz=fwye{q_z-Ru3oY{wMpCK(m+>3P$gBjP^byKV^Qkx`IOlK#q+Xzc}qeX zf9T_*=5kPikY(B~ySq2uDhC+g#GWr#{}=5D%H04vKw5e5%P8`1QVRZwt?6*y4?+_o zI2y(lemq;L+mUk6dO8IF`BYog#ffmw%Spg6_oMzEZb`g+IpyU(p<0V)y7q2}L&9#0 zN5b}E-53c8i1+OV4!P9SPF0pei7N0xq?U-aaHRvfu6qwHeiJpTZp`^?u{CRNQ4|~B z;%2$})0$yM-;mw@NN{6Y`mNfsGS-g<)(zCged9=HSd)^n4iY!Y45p}JsUFo&9@qr` z*y(MSoPjSJdl5UR>OE`@1ly-2(g2&9 z&EM(kI|4qg$L?L{w+0ouUH)*&FIXu;8-Y-{qeYw&8tYmEkMU}a0hBP#B$kJxYR$94 z#_BJ`{!~gF!rF82)dczu!G{tFNFrL-uj_Pj1c~%>>XA5)Dqv;;!XuGjd(`PG*K z+JqL|_fSmR#2MkH$84o-SPPQ*bf^0SV;+6R%{9R%a0W~T5uU5*y8PW5PWaQ<^mT14 z&b{CSfy;w6bXQ|(+Ir?@o?8c3TNu8vjH85I1f&F1O!COEa3=QV{9V6g+?Ofm=9@7^ zk!$GgrIYV!0#!99^4r)^cjI~x%OHM}OYFC41GH*aDI7-}B%9=*9P@h6gR!P}a$|9% zjJC1Su7$jjE>iS^rEHD1ZX{ntu-1B2%!q_Eot;5W!RYfe0`$s<2S`Fg#ie# zUaNEknjw5HryW*sDw*MnbJL6RZ7cPJ7qZ7%Jv|$ZA6dAMnirzh_VM4g5IE0aEY?&6iAI)_vF)e5tE18D 
zy27J?t>~FKWt2^*yXr};cQ=`X5vH#qsM*q_| z#o{E=@{gj|Lxdb$i?6e*XF|=bxA2Or^Dfj@B4^&+cWROE%MbIju<#G6@cX78BA?A)ezg7BI z1N(yk4OBEIn=_K}xO`V0$B#Hngw~`+I0p*sV30|qX;91Sp@M4!O3ll8P7b@)vxSVo zoAsGYmyPRVn=QmM2btntaP*ne&b-)_NH3p?P9%ED9@a#{WGdx2;P0Bu8wa_M)sLOQ z5vGFE%>tGe#sUWe%OM!2TPrIIU1W`YCLmgj$9SGNe+m^nnT?oE)(PAmW8bVqGYDP& z9r!{N_;5$d-82j6YzonYkBOtngf>Jy^6zyQ28c7NL(zwfe~*Rq{9XYK zz#|g>gpU}|XC68fu&*D+NTRD6Xv@hQXDHBRO%N&U=KG313Vmsn8W#;nFxxI)Q<}a# zyUndO&k(2?k_Bte7JtrZ3}3W);}T|pN}NEx4`-etgbDEO)17_i9gIQ;b^A!kuy5LE zs#jx;Sp|_eUeH6dPu5dVruSx?2epXToUv|<=ki}WMhWtkwg5K@Xi0NPgpwh>bb%b{ zmBFS;;I9xzL^V2!U44yUXW#igN8edlZWC|kv<$)QTCQ_&n!@e~TDo)sY1Sn<<;#P@ z$D((YE`Y^JvVdZQA64aVvJ2`bQE^|eNgK7nvo#!x*BKIN+Z44nGAT=fA7qTSix7f@-Di7BGk^TgpYAy zJ4?;?BXa;EzKFa#8q`A^l8Io+5A)7F_9S1YuBAJJ4@V!VM^|NXUI&zGwIFMI-q1z& zK-Rkro=HGl#cg>r)U-@eatdUv=fjNPgQ0grPWE>2g`DWv9wfSblOKmH<3q-v$m`Vg zfV>G_T8`g?zszPH5>*=(H?E?RRbX0>ktdmi`7-BAN}Jd^hJD-RKQ0sd#FN-* z+vb`s1G)qhi4)gA3Xi)*oY9jhjCp-#3A45CUCD)N8B`icb# zCZNDJPyh0O>jyITS?16rpMOUmLVrgJvGr6UN4hN5&Klu)Hh2kBKrByo8}NC@c&&z(C*@=zZ7_}n?% z>~rUEUJ&4ecSM9$dcZ&DpFNhle=e_^;TQPff`zn_^tp3|kwk}2@W9WP9OQMLojZ5A z5&P$St3Ar}+_|Q=524Z;?gmTa7mYMzjxVmQmTrF4y-)Glfa(g>c`8a9)cg04_r_97 zo+%T_TdP?XcZfKI-y(4cIS_~!gd7CgKVQfPy)<|%FFbl||D(&?`q~$}-)~xX0-|r( z7vHMFpkH>47Oi!^9Y)JR z*~R|v2P7&O@$?g`oezWlL3m3^nS_6%oKt9Pv!L#DF)z(@9Mz)P!JJ5D$LPNO$=B#v zGq0}&2O;*%;pK+w6@h^=R#2Mdnt)XWhRP%V`ML64co6A4m0wqfoW#q+LmUrrN2&2o ze0)4BK?vo$XA`Liz2LPoCdR-?qufLlmlR%Gy7X8Msi36Pk{O*6Tedo!%lOJFm38D~ zXH7}FWSiF|TfW39BTc$`hJug-&!dWiwcarM5ZuvH zc9G~=m6HQa@7i6F&4mf+?dBbp8fG#A`_V#rsU9!gh)I`@+j|~RXXCS2GKD~fDC&(p z1no!UOCGV`xkHwHG87gN^vq$oz<;R9mge!B=-0yqOs?k3GOGsq-H z|Cc45gsO7d{w7VBCH8qq209kTMmlX;P_w%4=n?Q%D$;kUGDiL#ZLE)G+(Gyth4oW^G zgp}hB-5775W9}6)+Lx#39HnDfnBE!K4!3R|iZl6dv~O@cAk0Ub{OnlbAd8Yd!{_*5 zZ7&WJMOf=M8mj;Criv+aR=tCm^S`$cZ9ySnmQ5=)(=N9nLYc^-)c%|xQ2td&N#^lZ z26B6+aq%$|JA|L`Y^71$Vo`y2Cm`BH&k&{)YWhItzQzEDIGtMDc6s>&2*l(pv+z?X z7>O8yktyPtb5tYy??DjP`Z5O+Tha$sV-alxe}T$WZcd8(>)U1R{RPQjL|ENf&?rL1 
z`m>9W91q~fHJV_u8vex6rXmXNhF%!jheBKa#vMQX0!AQh{nq{w2ps>jQG2-i?89-6 zWsVRCn!kgiVyfL7G-R#w-|3*6`raje$k@sQ*(?Pwy>WK3D6&6CBfxBIWfToaLUR>7 z^h$&O-2ya3D;a%HJwIT7!Q^R#K=C7}tnq2eG^K`2QHt-je?rwV4)At3s{N1aK!r8( zBw}n~am6YiAkOl{jhT&*CKNAENoNtb??1C0{4>6Hw;zPa-*duW)9!$PymIy#b0pq7 ze$E9_!LV(msYA987PmM^OBC6@+uvZcp|? zf$wlc==uLFbMyZKX-DeCrtK4Yk&67n!)1Eu~4Rg+CgZoVFY;h zANKH*E)=&dX10%^#)d`84NGS|Ss?uQXJ?6i9jv+>;Je)azs>!1QyvXdSs8^;0W;25 zHEq~A2t13O;I5zlKUgOF9_izssrj*#}Pp!R`MRlw#97l}yf^@M?4Om@{NUAlfvbnu4NI z@MFpU!Z=XRVu4844Xg{U|DK?6JC>?6CHG7an#+E!LT#+o&j+Fo`6vS~0YbkJZLK0x z`SLSyD}a>-qb+wE&$-U-`_(_fO9nI#qoVy=jK^YwslOZ)N-bOkS5}*&82;*h%nIQT zX>Kj+jIiJFzPm|%dpi~P({pD=1NYlRX{n6QJx4y6&E8;jv~lZDkZOY58N-ttPN*#>*=mvf(Q?0A*L zQk#avxyRltk*|&YU7l;Fa^2NSbCH4!CW)WjIz7fuq|)=HX=v2*eM)B$Q+-@dr%cDi zy!+y#lHU72aX$GS;@Cu7tt2FI3Y`+XX4!elEb!2B=BnZn=Yu}YQ^ezfJU%`9^zuM% z;%m{32iM0vL@p2aVJzp+Q8NgI0{G^r51K|?cbrKcyEgHs1*c5lL5@2|FPh#;-FBHV z4Lk5=bW)=@;&pm56AC}6Sl1pXlNym>WCbI<&`@Ja7SdgL6{W8guAwd|Ax+mq&c$1M zPM~I&sT&fv#LRuOi6~#C`CVm7i=04>f9q_pm@HI-sWT@wt`(1+L0-ceo?cKG(3$f% z>Gm@dFj9b{SZw{!-X+mH>JDhuAuu$4^C8kgk3ajOR^8sBPk}ZU{Qa-cic-v?f0O!V zr`2)qeec7)vZ2b8D=SsM9s$;8blhO5nr%+^+08ao#oQ+-FBxuf&@Dk`Vhb0F*^!27 zP-Cyo&813(49CtlW_JDB^z5yIl_5fgt~h`6)3z|w?iDZY&^HNhUS{(!Kfe)A(oayp zYE24QFiXIwEXe|dT?88}Y96Pl4oRWl%VbnspV)h~+u9!I@ng*S;)1#v3VOC?gT-`H zCKV$A^verDIceZ>F&9t4NbJdwaGqR4C#l(}&%m7`Va8P7@P7Ax&oomd{skHf<>DQt zdr0aeS$b9o2k9y1JKrz(&fq^Asxoyaw~UVORy`m()sQvxO{wY!y6D*tWCPq)>9 z%g<^I!s1#~8VwWOkS-k=YOM%FdjLHb?3!HxbiIK@uk(xp<}l^Z4>XLHvgUZXHIBZI zq6^np7SR?f!09&I$u?NB+48zp*eZagNiNT*Dn=1}c^GVUeIGor4j;&LQ0R|2u6r{< z6h?;YL{mp{sA8kj?(Ms|qxBve$!n!nxw0zP;0{^y4{$wthxU)v zAjtOCQ|f@0ur}Z!XRo=`>h{cRQY$+mM+GXG-xd-gl!=Te$tU?oxL{rel_V+V>>qyq z&c?I_D}0B}eOB+)v{vaDA3O$Z<}69TbW3Au>q__CuHC|0d4`YfrAX`y33-WXd(%#? 
zoB9_VQE;cjHSVQJ9}>eZxYf%|J(LtPda!W&-`%6xv{Ij`InOe#30r@VmH&AE$XaC@ zwmenS78^TPj{3(s&?FrnaOJ(-7|3f5HMV`8{wlg56+;vwZLVt2Ke=-;HT^D=GZ6s3 zKhH|i5!JZ%X%T@W;?-iMl)jMQ$bn_uXG};f#=@}X>VS{ev_uxwSapwD8f9`@P)p|R zvVT+tmN2_GT)?YKoI08=`}19uWbQdhodbovePatF2qf(+_|acrAGMBnpKr7ju9*qO z`iSAv>MjhcN^mal&Mjpx=KdpNusg95!}6ZxTDJa?Vhdp5T_W0ZFSsaH+ejS3qb!vFV7b z8_$&Trd8@EXnu49+U)XpR&|m2U*hLTHtF@&q0tv+Sh15ZOPrDG32!k7rT$CJ)Y2dA z+R)`BiZmoVgNCc>1$Uy_5baB-pKojbM#N2ZWCN6k5;@1Kf$PZ~H*JnMlXRRkYGHSiJY`Y8u5ury^4|_taPQD1C3c)miEQ(!nGu`u! ztWWk$hX(U55Jof%zK;^SPsQn8{nUuW%5hTa$13x@=wH5~Hl}0adX1Z;n@j}f%Pp(7 z90^0sTIO=qNY;Jy_Ri68-(#jYFre2?=jW!K4gDbNlWq1Qedp=9qKtORB&u+&e?%^X zXtCQ?{?L^0uB{k~jQ>SvSUa*Phws1DWNh9wfoC86<7Uk?2le85YWoziEsT{RH?2R+W%Xfg8WdiM;<+=Tr1Nk zd)~E@j?F~p)5VrlHW*t50H-tVq!vFskoUr8Sfyy0&LJu%i*W7fNps=P{{o=B>9~XZ zdgl@4glE!kT_L<8WZ}Sn%#=eSm2`#4;{95S__VD@5#2{LG*5&oUM(MebhP&Q?HO;u zj(9}Jo5X;o;MwV28}!nF6(&{)CP8iJ;3|mXT`Xc61W^B{Zi?t|5@b&K!9D%?&`Rh& z3})#nSXy)J|8H?XOQ|}ZjMm^yyrOkT^{JstM{|lQ(A_ND0+V~j#gRpR;-uSw8zovq z-W960s4!IR%hk(z12qhQ7!mYHf*e5kNap6svx^OPeq44>5RCIdU*`Iqn!-r37&MqJ z$^-w$K%2QP?y#%uFiAqIo*c^F5~5=il8QRJk2f1sF0*0r_=%ZK#_>R zGPR5)b)g@90@>?;Sf7^YP-mQ?rSALR#Al60H`%>Q7I7{GFA%`|Jk14tu_$mFw95Ck zK_(dAaCV26sIi(-J82mfHrg{-&P;EE5F;d?2yUI{g%2Oh7!(8_^%}NdK z^G||8_cP{+Ord7WtJeM&^0k!MNW*F=h_;u9^tUR4BA!?3stjjPylf9^AEDPZ{G)Jy zf{wN|Q~k+!!R6Q_*0qfEY&EeDyHN9fLqTj#BR;L&+pcr*#7&@d=w$}S@eRWhJX4jg zLRir>hDlZ~t$!20Db7HbAIp;;K}E}AZtrt`HKolJTr;W%9Z2ZaQs`Jp<8kig#n~X63R^wsAT3@y;ZiX zUsQT9kRnSfKH>PU7DLhft{9hpCwVC%J=EydT|z@yb(fdQ&Q8@?ow4}2Qd?Nn16w1X zmT5?z2B>w@BIG3x2|a+ar@;AWn;Dq!m=}O-s!h{jjUPLwrtf$lhRl zOV;gwnt+DV&_E8fA(Od2t`{19C6J=xt2@o#vKfufCATq$|47DU<$0!(m(U~1hj}95 z>QS_s{~oHL>BqxM%vAv|?+BPp>el+_1V5rqA|vT}9b4D~ri=J=&QKhjPW6K~DY`l3 z(C3jR(Tj>li%MqPk{5Oq{PYC>qBe@KC|~E&=qr~s4C<2B2@&RI@A|JhYK77HDJ
IwtHB}y5oH{Wng(~#Vv)7je|L!cx%s8Q_uh`WHm*y z945&(q1N_ZIJ~uY^tXwKLQQqUC_o_zi@-CiFSL~8@{sE%P`it=LFxI?rSmL0&P{z_ zs1o$0OQGLkF-Tpz6+une{<~pOo$p6p<~q>0r`Fy3bAu7a773kJJ$B0uR)eKKUAHZ{ z4A+C`fX<43QDvZ0;k6?({Z4$!_M6^zu<;FkeZ@i6;Y6sl#A{>k42`bXC zYW(={g_#~D@dEc@x=rz7`U#I;%mt&iL!X2c#uE@XT{9B!$BK2mu!^*=ZiO_F_F%Re zD6r6tP7YG!lZ+P+5~$YL($8f*T>#H;Azugs=NCs)!9sm?A{J!^bJeZA`4`2#*SA#K zI(Y9TYAO|;l`H2%n_ryOD~4y6A<(qOD7NaC5kGkR##chDH|2K=t5XsJf#N6xaRFA0 zgWE!eEMkTe?kZT$v*BoVQqNkx`9jiiyy%v&-9)g@0cqnbnMgm|by1B_#7#-XVsluI z8kIE+4-0fibW6<=yH4f)cF;kXZ;_G)UxPqe;oIj_*22gq0b-V3P6B$OQ^a>j${)%e z&DdCNPvFX%);)qcnxdc9fs_J^GMV5S@;lNoNO^@qQd!8AkovoRayp!aR4xo|yxo2$ z_Dr%SsrCX9_0bEvWZg8cGS_z-TgDaZ-$1$3p2JC|R;B$(3M%dQ3Z2?hqxVuS)kkX! z$FBSb$T`?<7)4Uy!q5!xs#^nzo<49?7KY!8PTg=k-?(`Ab!>tj1By^HJJ6ql z3`U@gdjrJbxow$^=@k!rP-UUWknND)NXWZ>$$hsyZVOO>$Y%_bl%F+pFa&_C@-c+g z#<}!AcpB(I`O^RBDoJ?^1T1H;sHXL z(DY-9B()Is(b;+~XifIJ>XQ4fSh(nH?_&Q=m>k9@Aibe$v0ybh6UOWMFjqY7c*{>q z0+fW!LfNfR6MHyejAPPUhucr;gQEFX()P4HR5#0JB9n*(sqK-Y{JjtlsqN4w0PNFg zPSRDjsFa-;>VLwmO%GN?NIMeRM2~GOs|W#^c&Ec1j)`8!k)^iKWO zvl|f0NfM$}*9r}fjxef!)-&_#68zjjQ>eDE(Us508~cXe8=7d?O~#(h7UtOded?1+ z2GzY~fh}IQ{BR(hu!ZA$4)4liO$>xzd2Q>lNj@eSRG4)dea`~~av<71oBs5#-)czH zJmD21%;d%ku1}Qk)N^Q1yX*Qo9u`7jJ_7_0i>!) 
zL5Er3LU!e(H$FA@5M}P`M zb?b3Q-{2^aWN*t9c<##c^j5Lt}{*pTR+)Jx%7N>Ebv(@jqHE{aMX5!7hZ3_rxgsF=wF3vUb4FK#~27zWX9fs&)l7ITf1kZ%guUFPQZ+xO0bM-@4 zUsI5&Imf<8lRundyxw^X-LBrEYfC_VCXZK(A`6$}27`;wCsP3b>1@39S*d7yw0g%# zJ{5tuNtazNsFbD%VjvLA$Hd{_!>zc}ZW3Qt0_Fo~mHP&?wrABMseOQM28FVL?0DXK zn+Fs+LD?sPR&P-W8y3*H{k|bH)hUq$fD9nA_yBLqr9|1a)pdL6k>ZrzHasmd!9?HLpFL z2nN|1ul&ZEL-yev44Tav@{G__-eqq2ZyB3>#l|l$AQMJk?zOGS3jM)K{=rjL@TX@M zU%%cz_L*UR;1lmYmy4HW(J=1zoQ{oOAciWu+`vlv))pYnXYIN^C$4uyt&x)r2_+B0 zuf!*HhM?gZ_5WM>iLS0{RMzYNxja*3(s=&P%CCNsK)_kc90i8i`D@>Yp)@-rd;<2( zBtWTgUHw9KP~`U+;ATiGD3ITl;of(#N>4KxrFIHKAUbc&-(z`ImiW>0p1v28FTJOf zE3Y1&eIxNTHW(v|7~gJjdeK{FM$9*+L4d1jrx9vp>%5bNP8Sz&;T)heow64>%#%eZ zbNSn3)n@XiKXwonLh*)P@kj!i8%9$5XsJWyr-(b(4R6tl)%X)Hv8fD>f(hskfZmaT zOdyX=YV2qIf#OK#hM7a^o1KFeqgRqGLj;i0!@a+;N`l660H%XyDKxOSWg5g=!#59m z6YL@z1D*yKB9IG;H}b4PrG*C5yxLs_9lmF&T`PtuVIX2;d&TMXwBMXIE9?L>Hw9(@ zNC3x+zKM!qSXO+qc#A^6t#=;_6b{B@M4YofX%gZ7bwMMEmV4YqBT`qO z8VK<-3BY!Pj?Edae7)OkT~HpfmdYJDo%6wV&b{TK&~1yd8itoYo4)tFZux+%H1PRB z2)g1PTegUxGY_WcW3a4Fv28xUq+79EMVvx1NHY7!v7<%DwXNjTpbp92VEOOTTb-8P zV1%O-1xU(WvNNs$KOrl#g4N7@ss8?%% zZ&M}_jC4jkewBbVZ_&9YM*GR{OQF^fLrll?|v^bJiF*!vVbCk5T+M> z;O9iITgY{co;Km$nbSF>_En-&Ru(?YJJQ`Ii@Ky;W-kp&t1qS2Q*1>ykmLv`sy(D7 zEfy;$f%`lBo!B=AhpE|5#0F*>Bq18^8(c|2pv0oZkCi=%QetxfD~gF|e@$&6YXa64 zOtMcAm%1k57(&;d9WGY#Eh^6N`Tb0F(EU`~G8wHPnFwofi1FvrERe4#U0ZL$xA~^{4#_(ND&f()J(eY+nip(Rq!Z@c;KMP> z4w`}kpsC9MF1ga1n>=Rp#SjX>37z#4Dk?XLV`!B?cMbXNw*_=otsL78G)(~3l3cW3 zI2t9BS_wuA#$2jI|3tS)wKSB6iNhMg5ot;LO&iQ8pJFr?))zXq51g)h3!X>m=!RcW z^Jp_gorq;Q0y|jPV-VngB3~awDepR5_9Ur#xI(~XM!k*Xlm<=gt{IVl;tr^c(VigtDH!%knF3;GBZO9(6z2 zwZcgEuJu|yM;K@FgT_C4kZdggPJl(B631@dMJ>sgS@Ceh^mUFyVgjF8c<22udZ>6V{T2 zI=KF)l1FRHZA6LA*$&b8vQ`@J^OC_nf)RD1KA{8EI4)N3MiQEh9@13NDxFrLEJ#9% z6x#4Y5Yt~~C^&y+e*lgy$JPfYjp&lZetA?kBmBvjz|K|ATueZPj03fYM*eTpekIv$ ze;f>}04h1<`Gb*-;Ld5N$6ecOzaW6*00P6`QJkCz(y%i3(A~QH@x~}D-~0D~ZIhu& zZH8MD=iR8QB>6{=9IIsGTrKx@bTQ{Z%Mx8045g^Zn87Gj$dOBOP zXP!F<1)N50uJEMvLW-vC^wUvkFnM!C-4TSn3s}Db+ItMFBZRT>fo)cf{=C6CkUEQ$ 
zp!k=*guox6I$W{Hk5%GOV+=$!FHP2W3QU((S)rzPV;Ex&fvp5fdXx(Gv2ft|zrn$y z=Li&J_pY6eS7PaF`dfSaiS3ePv?_Wrw*|4blQGD6g`-8zLINKRqQibP5r>4~0Dds_g0728qykaB`yhv(eWAuk}t&^>X^# zWNLU$od*mnJedtzp_NDEFd=?gG(Ylq$x_iL_aPAK03iIZ1H_1`qh^y`M?6T6WapQU zX(9rRw4{ApjqvMjIbz$#=5Bz81=`u|LTGSo<2q)(;HZWYv~-?-*ARLHs_8a_X9=li ztVTW%lJ|F8|AAC99G0iX1_|K}Exu>}@ngNmZ1UQGJ!&TyW%XO8Bdi4uX68vk=R(q+ z!gn~VkhS3E;KVj`y+r}AKi7erWchdNA8aa{w$zthhi@R!O6Cua2IaRbF(eqA?$=9}71%l9>%s?TrDO(d8i?%O zEUZ&2j<4w0Kx=s(6Vm?LQzm!IjIp108SA3~(PFd(oRU;3=c(s_TM|PCfpBoQE~yw9 z3NptwUYxe=eH2cvwnhGy1vdPR-0i&NndJ8*K`2|nPd^AG^DnpRkSTlzURbq)gR_jN z8&KkI!H5Ul^hiRSO*S@N{+mX-oB0!I?Nzhue(wV^Y+#7O=UY*6z1R%k8efAMx znCyVhr{G(Lh-hPyPv zuPeDTPI!V|G8xSJI1Ct=77Rn-6E)?Rfh9|gYlNBQmJxh&)^Z4W`kE~?MsYDU6Os*!UY2Pp+Hxae>b&AG)Zh3HFfR2%= zS{ghMOjvytzv@5wc^$EHvw2IMLNbx#fLJ5A6Iv^Yu0pJsVkNDt*o1$O;JE( zE56hn){?fHt5-}`S+p~KEY~{ScZW#ryOW9B08@XKJ}z}AUN~L9jIv$Lf2TRO)Ir1g zWC6OI?|V(*KGd8F0CdUE%k$r-U!1p?3S@7Au+;kpN*!H}Y|W0i4~C_g0yNH$6wfQ? zos?&`2tmV5G{#w!(MW_2~hPpG*g70Z*60|)P|iX|_4v-Sjpq9T6Whj_JhUT>~) zh6DF|`74Zk2ieSX);adnJaRjEw!PK5;+=b*dX>Z*I&i^I(9X{?X@>wfuv;#;!EpW~o(L zwn21AVbBUwpl)6z@y^Mz6|hL4u&2!%xtN{mQ$cWqAKCnpbpAak@V=HzTIvu13owZ6 zpHO4nTKrhtgDG1KoJiv$&?Cj3>^2moHT$rX=eMs%TqIL}9vw|?wro+dU;>4%YyLk| zW+X+B&zJk6=03!rhYfH*8Okq!#!E(C&XI;46=5xgh`+qx&KCH!a|)A1g9m7dr;LG` zus66FDHC5$tL7(;db@`ejTEZ^CHndDb!t%47yJ4e<2&A$FJhS{F4P(Fdw;V8Rpbsc z%#*_`@-v=R5kMIU8GkO7upYkf%Xil!9V_pPAD|7VyKL~?%2Kd*WK%!fOa3SetAMux zDyLg!6#}SSV-Wl+hp>*A!y18zHUy$xoV%|yup+QukMVnLL(v#BH@x6|GG6*H{0i9~o|*jY<5QGq+ra5#1OMrd4ECS&S;p7-dT^Ll@%9rA{Hypo3 z7Ihv=+k1ZL!&Kb1zHR`IFG8`|bN4nmdlf#mR835|0QxQq1Y~LjmczYgtWW?P)V8!V z*%esk;^i(t2WS9N-qTz|5a%>Q%JTimkS&jp6II+qD%GN7z<9oJ6OdoC6Oea9d1~qM zBL$VzckeyU=TxO`6{K2DbfKF%eaVIFw?^p!@)3c;e0BIIfpMUYT?( z5V2S9J5Pl&H6R+7xaISJa>L;6Mmyyu{axw@ymt}-_eAagE;pOYRXfGyE=$M(6xvF| z{eVsZbp5d4D#fiI2vmSrahy>2W(CNkFgcL!qBf7+hh2B>Qq+8;1U547c2~U-t_shpVf=k zMRdqZ>VPDzkvyb%jC1`u(y|TXkm?%?Fo}acRg1Hp_OYK-6)?7fmbu1((N+=Bf$?;X zWBGJKXh`PyP^Ure#(r+VvNxbfxzY-C9{ 
zz;Yx@E`QlILZSeX8o0viqWlD!9`*iXV7N)0DqEr?cOe@u zP}$jL^nUnTm6cU>KK6AEJ&ZKTQc7fIs?JNPv8<~Ot&CSO!fXpe^(TKC0JPFVhu9fKeVz2-7Z!f3j? zumIk*6LIXrYBAl4mk1->UC-=@D*8cF&O?iqy_qmI4>#XP*!@1=i`Ei4Q`1)^V1Ct! zwS;8~Quco}!^)z5dB8WOXwmGinX$t0PmZ@S7?o@^yInW zTR~nKV$z+az2+;-<2`*z(Q}VuDmrk8$|dZk@X=^3899oun&0rvsxv*esQZCMsrfta%LT#$7luEnLxUEOY^v2yn7RZ_- zcMldwHSjDxC`7=-kHajJ2-SgJ2TZwtB;h5MEJEFNIc0ph(N7uN6x?riO#J%!kVD@Y z9nqSottbBaW(Ssst7s~#zI|LYOxU{Qo=5_wqe!EJp=Qs!+Uk7#_FWrSOp|WV=jlzB z4$RHKQN4y|R--KeXcHB)_W?q==&x?gX zj-F?^NQW-;owu7JW&AsYh}Hy{g!Yl@ma$g81rb)5y{fK)Yd3sXV7n_uv!}USXR*L( zz*JG+M}s~q_?V{R_ImS|t2<-vT#KX3%g6|_X7gR8Oq6C_0sZIt>D$|WqnioAmKe28 z76Ku=mMa*J#K<`PO!$XRO4LZHeaoB3%EpTd^6xort8(iEbPD$8CFXdZKS_2kv?6O( zIZ+q^XOc1$RVhK!oOr57MG}1Km;~wQQAyWn7gL;RpII|^{G;sszXHC&K*oJ{AY=PO zeW+7;Y{DwftLHQc{(d%E#Uir*uJ%~?V@9As$`^Sr*EwVPo*mUpUk>wphBVm@d-2--4NqnuqX(E6Piu-=}-Q>kmJ+5CnW+X#RI z8DsCz;cp|Lyr6(UV4kb#b?E*V>)eogt~31=5$~}u$#_)kv2i@A* zX9G5!ghm0c=L^ha>62gM%vlBxrBRdXk)cLgpJ)I>1Ua9Nr2OlzrUjQo)8#U+)%w?x zKoVHrdo4QqKvWWAcpLvBD1>4jTu5f4Vi_r~C8!6+Z4+#Ps_qkH;3WLFsRiHUv?07r z*2|~uW>Fva?q=pOIpD^QEI54j%K1KQz*C>M0_?`;GKP(^g{6T@-2!b=I2khhAoR)@c2`WsB~XEenHnBbkY^~pVB^&SxgI7NML-2C z#lR1LY6JsZ<86qwS_YaaOshCYMh&P!Sph>lXl*j9Y}sR0E=(;^~MS zV*~OLQadC`{@uRhk^=RYnzARaa=>N2Z=km_;M@WqPrXmZAx1Dgv@8(QLM>JEg~6sJ zIRoGGZn!sqVxeC~STo`>C7G5F@-UhZX(rd93&@8F_{h@!Xr&enF`X-Kiia8D+2IE! zLQ_uleYxylV zY7Ikb=@JDWpg4$v370x>Qd_4qQuEa*Gyv*#p{adGeLC%Gdyy2UVfbU6;o1|rOF;LF zp}$3@CN#UX_Z_b8k*kqQru&D)6`Q;K%3myS7!a+XC_q_s9;aAq{(8C5`3|pqYcmo? 
zdbQ7oW5Q?20+Ys{HFzaZWa7MiaV*fsP;{^B&9q|$f1EZJis>EVF6U~H%0E(sTIBLt zAt}DR;VnaJ*5!x{w!5|e+67_^1na`0WC*NO*V_GNjRG@<3EGWb-O$ls)IE8(2oFYZ zyu0OwyCSr9oahHp>mkWR=MB$PAk7l#d(n9ThKY6iM0?owUhz(mkd4)-PmKXe3UN-@u#grOLe1R1T+c-f|gt$x^T(1(pJn^IIItf|)Yn57AtHY3O z;I+e=8GpWfGXD9}#6F3t;CH%muPsdg8s_A0$zmMM?_$*tto@n&sjsj3T)*X#WLP^ zH;Tn9j>)mS9c*Nc#je`dv(Ol`-)S9bDg9FWJ@#msm@lQ~cyA_rsV4e0K%6)u5wVQn z=|DRs?laZQtXe0~|8{fS~;JTVAYHR1`z1irkE%k63A$45to|992I(S7*oRnIg_Y zcR1X22t%NL1Mgi(^b=Co0Jio!N7Y@d8lfBsb8AoH2^|asWC1)UYai08fh=qoy!aSw z@dC|JY!w7rDb&{gFxa?hr4C9p-Ge!BaBZ=^Ix;7rI4ndM+4K2cM5?RfGw*j2gu!2g zhciP=MoVddk_99sqP&h*SnQ;TlA@>U0V+RnpH0DaP~gc_a5^Nze*vG))JmPkG`^cV zjYz7Lektjac*rw;!v}fWRTR<-G zRci(5xwcj?cmc1*LoE#AA#Qj~o?nK?jDk7(d5PSv&8vP`z#H?Ln+J1IKc34TVpoN` z(wzyqDoBcCnNa@5qO`e{Kb5YRvo%c-eZfx(*hi|+VlYCc=N=Rba|0~IkyFvxR$z?F5H26T!& zfq|rrBo?}C*!V~zs^yS%e@mQ~saLuLXN-rJr^}rtP9*+0XF@IS=!Vq3I%E2`qN5U~ zfiVX65m#uh%mdJTAfpO>0~8#vrkjBthYcOP4rEfK)Wz2L7CV>x72i5 zmw6J~(1TPdxh{Q}c!u8@?;mI!ZHnZr9!DBCGc=FqP=P*Qqfa z^a>u-H7WPFy6<22-ek(L*22kpnn9oeVB5f5Q!uv14;%0CR$W&4{vy+#)WzGzpUbSZ zD-5cd-FY384>x<(`>t&(m(mC{=64qc0?JIZusz9EimHkJ`QoM_EjNHX=hPK(^U$Is ze=c_+9Umd+whvgd|=AIDS=zFPdJ*Q2~fABcu z!4?N~X87r53e>$#_NMHNHcrxzv$4eFK}VgZO1ra zOWXbnu$)l4NT%0JysA`^JxmmjAJL;UGps8WFJ00I3>CQVH@WUQTlbX3e_=RFcYWMN zrLiC!@YyAn8tiGN;n4A zB?PGem$=;#MtX%^q(eDoc~*X*>bwMv{T^@P?&>Qi>jRM2$6wafJP zabdNjDJM!Lp#B$#Vh&e%0vv8wlP{)y+CVxm(ITYgAgMy18fFOrI&sDb`+{aCNH7%T#?hZ)9sly8#@kIK

#x~4rcmt2U%|komUzih^@q2{#=?Y z_Nefxr!GzBun)yIE3xlFMpAkr=n1GkN--4$!WW@+!rMZ5mod<&QbS1ps2 z={Uybt!j<&=u7$yU(?`BG(!#0HqrKfc+1QsZ?)^08nZ$R0T6=V?-H-jn zD7b+176%Qli!pZVS^8(g&o>t5xxD#i6ePN_MxLC+&8IJjCYe9FAP^kP0@+j#YeCv$6Q2*Lpi@rz=%Nta5**K3jr7>3J(Czkj}u1C!Q^HE(~=QKGroll2qG zljnjdWWN3kIOz0+zfYicObodhmwZ<``vJ+*os@09{bkM?g5DM#w8iP!d%%hEbI3W# z%v!LM4mK^oXanW+he}soFTm`XYYQKNn%KSPK88cWQ72_7*MISK>_F4S*ep$y{Upu# zXLH5h=U71In3kycu2jVFNV-O$)5~PZ+NFH1_Qd~7U+tUa zG`v9f>8-4*&j$N;fsJ6zox@L-mMoqo0LGsrN_M%+n{@C~!V9l`zgH}@tJ7IzlTZn1 zO*fhDZ>dcp`I6WCctuk9Nd}$wRgVp7k~R15^}$}{!+LG}a$sT`;$G*9Ru$tHr<_CeziN zH`CC2)J*S=_m{h^lFcOi+`mUOH*`g&X zad5VZJTp3RWy+~6p0WkTt6Vz%gy5%bVEY=Rn?iD3CA%8UV{dqnKU9MoRA@DRBdpx% z8iVVRUeTYuej;<${qK34oHphFce;C9JDtP2`=tvSb~Lv0SPDaUh5h%3GFP=~6LC#V z=iaa`w2Mu3zK#EnWb15$x?OH9wB}sFcP8oHaFc1Q_{T9hrZ1AMSywwLng0#f4Q$i; zkZkW=uk$$f)A`gmk57PqH<_TFdUc0hsxnkiCi97kT>0lVJ3v3Jm6PLgyz zJI!NQ#AKyAacvvMe^^K^Zy4NAo|OFj z?M3(R0tT^jhRdG|Y!7jPqa28H*~ux02NlYgw1o)pB^g)u+!qFQ!np-1o;?(ntn|zy z76>q$iYc4SQ86J-xvLrx)3mr%ucHCnz2ABcSs)xF_}Nsbq5#aH9!inQ=rkL+hi$9*MS8<~+dLPeU{g|N@{#21h(2qV3S@+D+cRLpiD^d`+pMPuLW19p` zWeCJrG<<}aJQS~E(vG-eMIT57*|H|{Tn;Ag*$$dG#b{wW; z)~clJq&;&lfV&T=M_0UK)yD90rUfNJmNq%PYIP=WT)q3t8v}aidP(>;RpqgoVhN8{zmkuqXBX;k!i%c2zr5#Qeu@zbqRD7l#sZ=e4-S9MjQQm-V8#elwCp{+e%zf`y6e&EH@dMDBrUWK)x z^Pk)w09c!m zF_ps=-+wiD>OcdVU2YU^?^L5@)^D$-yOh_ix9-|YooJxjVcJuVbLA^b830ZMZBwH( zKOPOQSFY&$+C}B4Je&v+U*rzzB1bTcw-UVT;MjE{0PTZEBdMnOnwvS6n~w)(L51+7 zz>55c$1)54q5U%v_pdh}EC_avV>kl_+9h?^Gv8L3KdpdG2|Z5A%VW6IoBvDf@P|6qR__bG1_X8H^9u%} z=e!(4yXC}AR5qChrwkubyp^ORE)P9;{$-7P;HP?ira>cwn!r8R9WWGAnd$r|=} z4bEC*0i0Qa7GzmKkF1qVmKNOFgmr`lbj~|%YDM# z=F$&Ecu;imuL3y_k-bAN@tMN|G4LHX`=jsU+r(#@UNQ$+`!B3d8j_#Lo#gfqi<0NK zKdqm0Bw5NCEvq{|aH7-~Q&hgl)PKw^XsL%Q5F7Eh3{iWe<54~rMtjv)MT@wyGLbH~ zChwOxo~{T}gS9}=)LVge-!470XE}XAg~rbKb2qN;mau8#OQE~7yx8xl5m_C)8mBT1 z92BPD3$LgUgF3msO>ZYH?T+8`jwIQRxfq7~{x7cHGaRlyYWu!IkPt+)=m`M z5z)dZqb5Y}(Q6RVdlJ2iVe~ROqxa6}y)$|lj56jp@ksM28{h+BOPS!9xN7ViXQg 
zu%Rcl4l>eLeE3cx1XKb4;QS`y+t=^UzK<1K_-(rChbZzcsxjT8G?R%;jH%Zp7Pg}$ zaD{{>`g;xqR+J)3hUp@?tx#z|8K)N)8|B$5z4z; zOD|jlSz|>EVpUrvhIa+rwQ;w7Lyy0;0v=X`-^JBQ^;Wz-xSk(b1U(0QNy=1}T3X=xV zVDg524Q}QVMa}_0PFS@UTEf?uuD{2f2qz?tNXho7JE$n=a@HK`@zQS!L%Jg2gY6*S z0?~NZ#M4vm1k^D;Vl2X^hUFL^pJ)58AtJb#(PxvqOl)0*%FdkoJr?}b*YJ067}8D- zusIx77B%#plC9O+cl3~gRN_-Mnpu1!G#gb7mGq;`_HP8&_HGZ#Z3Cui%j=f*%l=JM zf?nA+&-(UujwVKw5jX3A;f=xHbg*r_l~q~5MQT31Z|KD2e3s-!>@;VcaYx)SK?5<(Zv(iL@g-tOd%Fylg7`Obd;UquqZOLT&AK!6thU}&eo1?1mA5X6g( zp=)*6JDX{HRT#85;3~UB$!k4b3aW{C#_MW_al+~3tkFX)`hKo0EXW@4rH^)wMMR9i zv@-HnAGp+7Ev33v^*JzIuMT|s8l98^cGO%(g_-7Qz7XPem6mxCpm?;AYJdeq;c$0J z-I2V&IjWXm*Q9UJ#^>7t$V_^Aww6{OQ8m2EE7{4u&wM)xMi*k78}xtOvY_hJOWXdI zG;%`QEs*QWN58Jc&W~luNn&y^0;eSscdoioy7R@K|C$z+R#FsofpWEu@Dr$Dhs~mB zIRP%e7$PWc-Z$j1JQ2xIV6a~6CexuDPlwpWt_f{72*0;t7A61DMJ`pkJk22odt;+2 z#^!VTJOudDPdBu*fRp@(Og-?Bq)xTFziaS=Oae($Kpshd@xPw;B$>EBF#9%aNXtF3 zCm<5jLJs#mLr|t&WVUE;Gy80o%Z&+w;KF@E6M_HxTd67xn3$eWKXueMLl6Xa;VQ1c z{AS6i^UyYWO)9VtX6Cc!Jg-+db&e~W7j9Qw-L^%Em6eAfSXTlAc_Dn<)|nol$^1s3 z^l7F<+J(Cj0asPi&LD$f=rDJ72VgspMKiu0b&1)d&kQQJa4%dsn?NwJfB<;9*k@r1ek3Q9U7XBCi)kJexmF}_{@cg7w z%WZ~1{z%^7I)bd^w0GLh-Njf>z~;N)8=&V-&=Cj$0eTSo_*k`f0T|I%h6T!8Jd6$C z&j#9?`<~1=&@ibPe_cKZMLdJlR8HitUG0`t9j#wnSC@-oP3oM_<~Q2Uoa5VUk5(e( z%fDu@1y3>_;cJ`_i%RuU7r?OPu6?DwZccBx=Kp-5x*gfVEBa=Kl+k~6iUakJ`RjeM=iCr_?jqYCC^t+^dUpRQBcE;I{Ht{ohsxJI9tu%@hax z_S|{R5rfG!`%)4-7FQ3(ZKM5NAVRHpI!q&;-3rGpUjoASm7%oy$sj|3hHx zeI#^~m%0aH>7K3T)+~zE_t0Vi6vQW-b>p&arUI|OZa&b>f`61t0Y+ z4_*kCtmFp3z0S^oiSv$h%w#4&U4`bF$*pPuHCDh~m4J)`>XG6aSePY9@gCwH2TpXK2&&f;)QyG)eS_SYIrjtWZl@>jrI%0~P^Z`Ce- z_2%yad8G@vq`d6v&3bcntt}yN@U8)F`?RpC{N?~X^jub$#fCIVgvc1qq7;Ee44*cH zt#s+OnyLctgv<7A^@duS5+Kp&?jtlYKn;}Q;P9_fKI%}4l7n9>f;NZpeqNoO&;fr( zr((h3bwV@sz`Sl`_dk5pBsu%HtED@#)iyWJIWU%D#a1-kb)j@YH|Mp!W|BVX)rwV7 ze}{s%tgJot)tWxq>u9GaKLU%>&+5a*AH23*czjBuS2vK@yRybd4RlN-RGdzG7Q1); z(5Hn-{qbUU?a-|HuS+vo()t3H^m}LljUv^tGz4GfvZVJ{nB_J_1V$YeS)?;N zc;)^l75cDh#eL+DoXYNqq)r>UTrlu|(K!`>tcd29O|CHfdpVzeJ7MPTsV(+5ZZ`r? 
z*qfA(im{%rPmi%>FKrQ{0;Mm2qtWV?zP2_ddTp%4kj8rX7u?=Rt7%8p!*~)@_1hOJ;16;RT|qca$POB-tDTTd1O9`;yYg>;HJSHb{+UcL#(@NRCHslVm+lTYsg2GA4t`KxmQ z+`giaMx)@AVrU8q9)XEu-!qu^!kIaXNTZ7U{XoMzQE9p1!m-9PJzI`_QX+BeXez(r z^T5ecn7kHL4RA47uSm?f0V5M2GUP8Cd!qrPK@YX2UgYnHhT|q%;6B)r$fWsr)>sr_ ziIbyaU322ccL+2?)vn0v|Hq=8t97bNAJny965|uy;&yzIdR_5d>%HsahGfVNM9_eKgAJ6kgr9gOlRW zqu}N`0D|J>8M+u)Q%;47Z* zu?iHU|3B1exABL;b+u7(vmi}a3&-qz--6U-%Exf@i`#f{AR&m*R5PU4{L6$uLZzpS zL;ltqffVN(>K9F7%2pZ?tY8Qs`z5b90C)zoB~sXhivM^ZCM)``(zMj4&b7z9{JIyRX45^X-Lu)XqHy zn>vaR))8#F63UtvCv=E0l^nzUCK8OuI>Fi=`#7EerrP^eA^1(T`G#r3X?36_+0&z_ z{ihyUMmC!nxpr-PMB>VrRKchEL)Jf%lXl|2lk5m8bkxy#f>h3BQ4xnVjZY^y>?J0{ z{|Fq-s7M1tLT{R=IBtq-pD70#Abbfm>EWhq3z6&FEL%P)wN(4Fb8j_XKLl54%-*51 z{x~pnktJD;H zP^0k}87HC)bBdqTj?xGts4g7yZ-46Qvb6Hiror`ui2K{fOsxd z#(pBjAcl9A1j!m}?zH@s_1XU4(5;R~Ej#rAn|)(H+yW`B=FxMH^o0JU5T63^pEHZte$t4E)4D-i`XO?-1Lwl0basg>%uX4Z*v68TwE9$B0m{)3(?(iiv$< z{fYy69d4^o2Wcz>$M_7?6Kv+lnr{Cfkb|`?8cOM&Iq@xkBPjQEay&BYW~}lxn>|#D zEoISM2b!nveRZ0?(QY(}p>v{VI37bG;<(r)`^VUQD@8 zxe<+R-7YErw>5Lau7CkqKh~H9#itoNLH|ip9^wlCFtSKRd!y8GZ0>FQvN*j3fEDkW zU7qP670fL;_}2Bciv%AfCqoVXp7qRBHo@@iQNi%{vG0GY)3p7!?FxZgu89)X%5;zt z1gN?BF{;E(lMIcx0_qa;P5SON#a9{#0&-4m8_5Qq4ezIkwR_$cigQ9u41;~N3Lwmoz-PuSU@-i_<$Zea{2-V#9u{_+Ffh~vf^BIF+l>E91)Qbh zHn0q~Sl%T#@;-vxi=QUpaNE}rm-x(U;I<;mLG${7{{KeP0oLvRvgy9eqC53$ebyc! 
z@RiDPC`b>gRL=vLe0$Umzl<1U0v zH=zHar$dq7-YWY#?a#lDz||XD#onF&9Y$$t5L#@u<(F9B9S}5xexg4#)E>-I-{&l@ z1xx3E)GF&ZM>Iqnw&5io_TETk#ce{GXuHVtG9` zX+?pu>63-{P+eSALcfhR{O#d)lR#tM(W%D*$=?m%{J8>1cYSr*Gzv^m2zXoFIyV_% z)2E27|2y9pJ<)MQsM+P!3k0cz15c60-&%cxaLZG_M;d_w1QlN~WiR={`Y6ET&ux z{d`{D^n6uX2XA(#Fv>lj^4HTm&95SO`0FzUKtc8a$Y^A39hkDZBiaB@-tXzZRHP=g z1sJoM!XwXeeG;xdj!U1o^*~!D0hLGdu=4;GUrf) zIMOZlXog@Giiiy3o-ccUTP){-ek`4=@@ua(8$Bck6u%^pCp;EhwlH(x0)fcg-#t+k zr*Amt;3X+ElcY0yW&_dZ@pd8f|Abh;>l#vCZMzcgF>YX=oQs_{DSF@#dS zZa|nIfOJyh%ZkjSJ0m8Lx7E8=tJ|brqze27P0rNra&mIH^j2N!dO^Av7>&VNAWmV= z7k%@D#-{EJ&HJGE1uBUvcku$>nTSr7NsUI;%_z$Puqa6>x-UAY4k#Bmkwj36VF5Ke z-{jeQhMV`Bs%g?hi!P4E$d`wk{A(H$5BmuTG}vRS{UL$LvOcz#i$(yQPk#i_z8;9W zGx7u4IhmH?rd?Nac(uJn!MVLc*u(P1<@3fJ^GmnN?`OdstYKp-A!x~x8vEMP?Dxc2 zk`7un(ZFuAwI0M zUgwyhrV>?R9`_V|Rmw8;dg~_&$cJ(@I83~zFW%v$3gA6EVA0Hny@P=9D!`Xq*JY1- zy>umUH%%H?ku98y>hgf-uY~cW6+XMXx`mJ3>HZp*z<4`RT>SHQ%fKCP1p;@lr%6uJ z0^d|BrO!`KUP#}k1g$!2B(+tb<7y<03Ey+^CHjH})VAiA*ZORt~gE2q>SrW0g3wwUwn3;S8v4#m;g z^sH6sXW2Cx97;iVU0ClXFs4!CWD8@TX(TSYWlyMR4xD>3AxBE`JN`nEX`!7Qc2Pk( zGl?9%bH4!TeQYBFRB_Uw!$^PAr*1Z4)2M>6XE?sTNZRX*byI;SJw?jy#IVGNX17P! 
z69orS%wMYK4FZx@2Vm%R!>pbyH5FJ^GJ3}jO8$EJd{_X|+8yNatkK0tYq|W%TX(M0 zX5pTl)^1Ta6BZVq9`aIdsnmcTx znxspXq|P3e7*Kf6Th&m_mayMaXWJno^-oZ%X@Anq&&j>)>flQI1>mm7U)8qieGNl3 zgx~c7#~n2LtI$=?@DQDW6*H=4GzO{giOEBi9xcXk0*ET&2@G2Mm<=fgAX3O3H`wi;AuKdjt?->tb+>_1k+J#jqfa0wq+_TXWJgX^|G2UaVWXP&5E(zAAaYv_Xzwgqlbi%qK3mV zFf7w>-9Vr+Bab(-!ko}37g+uN`_*Hqf4le0=oF(&pfqc_@pX|lR)$+S_uUHfT1#EUr~y#IqXmkYl6 zZM4%^`$WOA(n)z=gK|^Pr+f$J<81%p$DriXvvd0QAFJ-% z8eYLdsb{HPcc`>f@tr(YX6_q&;hN`9+wai2%_edfR&9Q->r$(-qpbv`)i@d^NT_}B z%*rI6yKH4_(CJ*@Mmz1)zp=(jyW9Jtrvr1Y1Z8cSlib8v!uYx|f}q>svQ$zYJ~mnY zvrjY$R8rVpa&YT6?RjQxR@x;=^@FRC?=|B$$M$oUaPL))6n7m01iMb<;$?@$Y_$3m zeQjDZ|DrAgB|Y2pwLW*JyNcnNRIjM~k!Q#muMox+iPews3fRH%O82g}RFj4NJNTW` z?&0`9@nFS;_oe)TTr&Y$+#HurjK3m^fww5;n)Y8$Lw{jX`|&}xUGB*lOUyk53vo%S zXiJKg_SxtFs?mhhSHr|JdH`@b~Q-R?8z+&e6T+Ynvy8sfk`h7AG`7$Lx z^4DPT?<){{>;tVJ%%(;osVpjLx?FOB)dex9pJ(;(Bf?p`@Nmy5-@s`MV~3L)eyfny zenW3{!C9_n0mak9pb~js`#MbiF8#BLDQ^@z)?goPDfB{w!VN#TjRgQtF!9Z94!n## zKx_K=ob#(F{fP!WW7gBx+5#aJc-5bZrpUBvFPxKLPzl2FjeViey_i! 
z8)kzubFZ(ot}p7+ev|n%EE;XR=bqYi9kVqDM*n-0qjmEOGB z)WwQls6!pG(lG-UgwNsGB>^sil7Gf<1=WlGad6KF|AVgFG1fU#qOi50y+}%zX#7Ao zGJo|EfLNT5V00RN*TqO4fi;m|p3zoBA>H>A9xyYK-bvoYjKrV3GT)|mhx!~kP4i~W zPHx5Pu@?C{UFpG`G&#}r1*?y0QbvUSJZUL9y!jEK&8weYm-3}7n+R+2IV@M3hfjA= zW~=v-GJDY%JKnR_3-wI=YTpt;P?NLmdEwyn?2WC|%g0Xo^RJlK=H|O&vv7_zb=nsS zBj_GJXP*8dYgyDhZWJ{eG+Wusozmv=n1Dzx;=Z9#&}79Y%S`D{Y%fy;6;WE=jRbsY z*ZNgArjlh?in(;YGpMMODoa0yXDKcgdtO?n&V?<tOoH@ z@j51o-AC5BzY>h3G6l8jw)>;X?98?_kOiV2|6mvZYR+nA(WiDp!>U&R^5s=0K-%9a z$?cS+6M#G`_2)pXXtee?Q(c4o*AxfqT#2EK;o_F~+Pf~F2n}uxF@eCddEeXc07(dk zh$Q7M*kqH^vGwroL#+`C#K1wMF zyTucJ>g0||1bDHhWOWnOvZl_+sXgI?NSE@PbtLk2mc>?Ni?`L5POq#8`m4Ez$WCTd z0_fcrn2tfjU!BM6QTst6+PZDXvkEG_*0BUDBse+&GF5`-=XfC6spG zOto{k2u6H)dey6zI2o0$?MK5a*(#Mh5bX_5XR!sa40_HdvlZ`&GtRGL0;C916Y6Kf z($A;t5A$v{Tz?EiIsPimbw28&Iqdnyesmkx_x0z@!5`d;vXWP?30)zxA$$Ap_?1D} zZ^f0lQ!X>dLOQ53+Q8rGtxfuf?N8rt?X}M=$I@DzFfSkOZ;3q{JL0@_EC70PkP0Iu z+rri_D`l7{>dstp_aU!Wx7!sh=gcz=m+R_JSBVrRBuHRIwl`~N!Dk|}OYqJ}&%Zei zq03XJb8~kY1{``wKMT7%j_>5ATJ}A2-ly@+_$kmkSt)0@$Tr16k$5i+Gl**~LOs>J z9X%9CN6+%4_s}W*q*wH#uS1}9J*Qu@M`w%VMRsZJ&OM<9H_^d7s)SZrj%+ji6baGO zV^R6a1Lh>Wq4c80SE_VMU&{4ijH%x`dZ6U`CtS`z1v2mo>8qmAlkaD-(2{nu`vqt+ zpv4@=@C%>^?Yw)IQyL48@%o{5gX%caOO zJaamaUNlrVP&=4TD*)efZ5U=*32ar0N+(L*o095s<~|1~p`lwJRx>BobZhFr;7r~m zNCCj7IIjn2i6Odo&Bb?~4C(?Mc1zX`&x&CtzT@kzhi1cEu-}=_76Ui!F!VOcHs60kGT9MrpcGFV>vL|=5Kf@lGB#oD`uH@e{gnu5U+Z)j)NMju>zZ5%MEgYyhsg*2c z(OmF!1tK9FUd7iFC*uHZKhN<|Va&=KOUE9*;19-at@Ccn{+H-WnG`PLLNTK5cKx?@ z?|_N(OMr#x4mJXH=Oz2^V*iDwS$y@}U8&VRx7wnf?c}st_Lb5P-7LPQ@V$OIMFv%y z!hOF|%{q}QcQQA~ZsJm#V+2$DaB5NNnQE(dd2C_st%;o@@UfMWyQHJvOw;1t_GfDs zNS-dvoQI)`*ZDC6lyWCzPdX~GkqlE$IFyp(ac`^wJNbib8)e(Sn8m-oA7ahPKESy4 zb6;4^oYjh?a6y6{ElCDgts+`1aiF_K&U-`CZrjI!P`nxrgNAhHh@Ic_K$iqkJpYV( zufv^KK|VBhTI9nucDl7l<_e#qJa@j;F#U*}dN-*nS%4E0mv)I0XK*VU@z0sn-_~#1 zh6JKDKIV@v_atvzUC6k|o|TTMhW(;`it1)fxU^s6H(U-*s$<_5@S2IeNK1Ml=X>w4 zV`Lusw|=?B!2Ig=?jC}vQ)UZn!ON}%+IwE*s6eq#lch3CAvZTqHz_rxeirU* 
zfE0>Jr<_W#-WWv==TSlEwOm0w+TO_N^rKtFKEQenujXp5ji>c&6x;46DIC;loN}g( zzRT-=r0#yQkoRtHs25Npae{_I@rEAgc~)|?VuRj>7ok~chUT6E@}?ohZnIKqE;$}_ zxok!$k>Mu*r8Ko&zOT?`0t(bQv>;h4DNu~+Ms65&p|;{_AHGZ*|5JLvsILcM4!x8= z2b!WEX7O1kqpZBDHodB zQrLLrdW^W;o>V~WXrn<(Dbg}5TmXu4)JrForu>L*~cR zUX^6C^{S~D$uAl)zCzXKb$aOh&|&}wKb+R0jHfZcw+4@2Z4A@-+~7)m`!FZ?`q<-x z&*P@FTZkpqD#kWsQ4DqD@(mLOVjq`?_bvu^is-eDUw_l3W>|}>6~{y!0Pj;KA>Y4X zC&t4NAsZOx+APlT90vG-?f}K*-jGEnV^_7FbI>TMsapU)0^dAA!?Bq;*wv;_S}z-P zAJQUwxTs6OsZD-G6B44aO;IrSR{Q+{#Z4@iPhzBAe~$&r?6jKTTYS`7WagHag*pQJ z_N?pgmPpp$osRwCrdVwEsC&qa0CIXx5z;LeSzq;xa0i%1Fss&IV2xb7xJr|KiB^m z5wnxAwq41rPYRo@O@xxOUJNXnf}3~Zvqn?nC4KShFcsNz_ofVacSXt{*fBh~CK5}h zx^%W3NTL2z>~+RZRT&zapn2@9)|YLYt$_%2S|}BSehm)-CLQ3IT_R&l4!^A$KsxDU z@3QGnKY5ZZC2#9lQw>r2^E;GVUMc>u&_**)JTz>80{*g)9&x;2N;-lf`YyHVT*NXP*}alQsH;7j0^lVDBOt05+O2H0m54dFzT< zfS{oiiHXAGJ61|}aApZmdbQ4fXOAX{FTT9+W#_q?7OG}t}gQJ1+wOk)2uJV*Mi8>o=z9k$L4hb~vUcEm@O z#H>0O5ea)6j4K^wsy&G|CH5BGxA!*)rE0OpNUGPVHzAzsVY9N@~9y!S{Ff@)eXR1)+v75i;U`r~pQEmEG zurmVuR6xml92p}OKbB>OD`11nGd9ib+gaMrt~vT;QGT-B1Noq#g)Zb~WixQk6gr>NZNG zP{zFl@a19_bcRdgmfb&}a&`a$ih;m&b^7h%)-;e}tJ2T)WV6H(B}Ql-er5Iz$n^rY zh9Lkdv1KPZ{9J>5sRJi|C5NcXpg@OO$TgkIm6oDT%<^|a?Bd3R$|0k+zHht#b;f$c zRfFuYxG;~W%=pKzrcj){Y?jYFV!Fh*dtiI;EOb+o6jlY|Z+#I`GeIhRm@MXm*RSRr zZ*)EqSC)DBLI6we!pJPeRqK@1Q8Cx9rQP$-4$Hx7JTGO=2Sll8anxT)VOO`^_+Yoa zrgUH=J3R19GzEycA8dGLOCtirC|lY~uiyEc`yUrd^1VWv2Ct7;ym?8VpnrT!48TI| z7R@iqTog|8^UhIG-jw5v5D0KCe>lk@p+YRZL1fBmaa#A7Tc~udDw$>dWahm}_|0(R z9S1+gZpoNZUSxpC5@_IH=*1qo7{zg`9K6zxb2X|iz@;GLS7oY;1>PXxpE*O}rZtCe zKu=v=Pg4{7*u+)ICvF;O0$bofa!wWymtoln28Lb1fEK}#)(C=qIhk*dsCmAcy|Uu$ zS?0d(-+HRDaj z#c7&aKdY-Zv5D(Hb$At$`#G8VV&G2zPT*C{I;)JNp1JvCMFBfO(h4#q((yy-()+4a zU?gO6rHvc8mjS-#E-5jq8_PO`2?Z=crXqWYZ?ZhMRyFs-aH=wdY&()`ifpz4?lI*hcDuLsxLCvO zGj)3lL-r&br0cUqstbCW>2HT#V^J_sC(+XSG`gW%0hy{WeDQ7r!(*HiZqGb|PBZ;L zlsAoqRJK+z$hmLK-HG(J;!&L81?bEvD*PHBjwuEFd7P}UH47sf<*>QAycYqyi=Q~> z$ErDpu_oX^hXdB!3~gOud+Z9V07u zN)qiLE5PBmw9Fqh_>8IeRcJ(SzZKg;bGp)X9u(68LxK^*ri)Fx9rL`{mrs75rs`yS 
z{d}N}SIyBXu!kiD(Y;E_y8o^34KMGih)JKhwJW56{`E?gt(!CAG1arhL=n34>LfqN zP%6u;UF4(G&g90+2f5-$VRNcm#rbB&?@$76?(3@Jyf#embCm=N=6 zV#On4QEA?(-)ctCP781q2-t$8VUc^&Ws5-14zat^1UlI6a2ZLB6gpWKjR_vq%}m-H zZOP5KSiwK?&cd7U2EYEHEh8|Iz$(+DdA~yw;V2kar60_%28?;x*ru$MkvqMvJ<`p7cC0q!K;~yK1QFAZu_K zimJ)IB%6-f@LkmZIq1!&L7X=!tmGqchLkMMt{vf90y1%3c4XANT7zMLmF6r_8AunZ znWrCzCPvcfc&b8N2t^k1-s-Qd;i75)zYTc+D6YoCS}4(*?CoE8Fkbb~gQ@t=SKSV) zY5m(ZCEF~=Cjns(RA6H<4f_5ap%r76D=C%W@{}~{?oarPMoO?h*G^B6IyY9}z~M2& zS7yJqVav&^slJZIkHGT!hBP+Z-R(EnW(7>5BzdN_@8De%3J-aw#E`9~htRO{(4`Sf z<7L=`?^;8Bt%v;z$AT>>I9y(+9ZLkZq^FC(4r6UizbA@VWIw%%mP?1ABm2@T#fLTB zK1zk=1$^`1*tSC#u9q*T$WlbX9R@hl8QJ#E)D?B{NXNV&gDFw!^_8*%)cIg3`$6UR z!8R<$j9K=8*7v2OdIU26>oK1~xc;wju$T1fy}(a zr)D2yUCx3h38Ok#!X5;-g$R_4%(<$Z3e>W@&b&r0`dRc6?#f5!1nwNr3KJS`{MbMA zY}>Qp*$ih*aJ}X4mq21qOYTHm`^*nFRm)hlKb$!@$3Y*=fVNsv;wO4tN|SkWR|*P~ z0fp(^LrkCd;dsf+q@9a9*M{>&@6& z$KUv6)VXH?eKeQr4dKmoj*O79#nQW|pJ0o}+7<5a2~18J=&9P#5L(Fihvpx<1tYh9 z731&UYy{dI%}%17^fMKYUBoAyM4!Ms^YXn<#Wu4v6OP90tug$^pI$TfNa!64pmqdC zgi_FqA2O(|0c6y%_g8dE(z7dOR#s9P_9FrLg%TCT;@MXqf-F0^RCG_d3Si=6S~w@O z^XYvu$K`71H9nw!$agzaPAYZXfM23=L8e=6_P_ff{xyuvG>{hQb~Y_Dk;J6SUNMnf zu**0&1A+mhz|p0Hoy}==hJqo_;7#)W*^#fI9)Nt%Q0%o_6j1J%F)F2T^R~ST7ATMm z!;{D|(F|>kBvC^Jc9WNTs^rs7-}F?p=~8AscIaY48fy0-;b@etk#sH6 z!F$Ba93alNX5CV}T9&HqyR@~LxYcy`^e3FLh;|9C)8Klpq@zm05$>wV)j+4|#@t}vnDyT7A$lAi?vUbB+?t2jMF8rr;<%@ja?7!);A z%FS{r`{wO0!aUT)JHs6Q3S6%FIEu=fu?DRyJZQ@VM{lPS=bJN{+a=a5PgzC87-2o; zSDv3`djIen(AFfsV_{wWa>JeB!v1qmwbJ=FgAwB|KroZum?v*aV401-oD5kn%MfP8 zi#8oRzsD1-BLl!?d5m}ObxQgRdfPC(=|6KT3kj_(n~niVeXecX zW#~M~^}I4>oj)IooQi2AbbmveY)N(fIaXw7l_I(MznkaGo6#v|&6%@lACP4FK^~{& z-(0T`55#pg&ESny$Wz9%?<2D!?~}<~dS4A(E?vBJbS`(WF=BHq{Vf&0pHzwHSlHDA z2G|+vZh#CJisztAKBsH~cC*6qBmFeb3- zlz_lPupagAi_-|d=%(=`oY-{a~h0S@DK(OIW* zpZ2Ed$35v;=$~4cwU!pF|G9_=I{AePoU%3-@Fcq|Z>DJjnK!?qfNID|tg*~85N$iF z4X}=jbXwg?z8~;+h-y!lIN-0)>L!gWcqcBNS^Hl+L$hC^_@}*14t@aWs(=&`#?3>1DU%(_S}aF0+(EQ;A#MC~?z~C(L@iBS4AEhuMbML>ffddymn+00oIy^RHP>)yg1H6JAVQ! 
z;us+L>8{aK{ko6K5xgWLp5SiZdM~+QVJmu8^z$hbWLqeB#V&CS5TUaW{uDh#HIKWsAYVm2T>1NB`7@KR#`k9#nsaq* z-fOkwT{K|rnUTHqjG^F%XrGc-RyVj(xD<*8za@5xtv*S%$9$S1Eqr>G-mM>cGbc;$ z1vJaUuG$1C;|hyZwc-o)duG?SmoC<4>)puFs&`7`dFek`v>r|Pa5-ncCe}wWPjJoy zq_DD?$x?nIWu5^7X3`>9cr8L8B<18$3%DpdR2HpUQdp6PQ@R1s(dMm1sVZ;g2&eS! zd}LM>rmH+i(7Y1koO9 zB*35Au{)d0^w*~Q#c&YAIeG=pvQS{_G9s>v?Zd+JY)CW=W&_GoqLCw|vE3rL#QPpq4bFwmg=))q!Nn)t})} zp?w#+lm)sn0HX2x!q#gET*|0Qu3SDI?FWsnD0zH|mu+Xw`~wf#n#4SP7;JT_;b3-Z z7paSJ|CtZ4m*?)Bhn@Y;?~d5k;BV=|joUEzuZog=V2?Dg$4=?XhE>euzIa;T!43J@ z!)N|IbwoFTPW}TyF5r|k_}bS>#l{>kcKxlXg`Z{3(szNLCvJXeg-MK_$m(xbB-VzM zwE;&~o?I=V{<-$LryO=&GjHP2%Hk6yFH=7IB|LC$*CL}Dw`Xi)Q*QFD!!I4tcSO0) zGWZ=V1BI&_YPaO@%53)N{p8SBj9ae6tr*Js;rxyB^ycj0-W3gq+o-b*-g9-0iXcol zYsa59IBqd@<{wRRA93v&SfEHeLd}+Me7_?+w?kaNuV49Op(rtKs&IZ_iyg3{4e^4j zmZvoLlis)S9kJVo8{q|KFh{{XR95htVtq?2QX8-A8i&(3tT% zKA&dWm#|-4Ik^t?_aC%}6%vhCh^$4n>5g}H(C>MEtk4as<^pd^{!mKOyIAbnOK2eU zwYmtjX0bF>D7`E+Pa~=<$z05J;QVbth?cxRLYn?Omm(~7^P6!=^*@0faXeKy&OemLl_evcG2wIR#DUg1WE^Vq2(;bVHez|hq6I4KM{IrllFJgq^@nRA z(hh!0uxZj1X=f&bo9+Xiqy5rBDF$Ny6qXhvhNWA{P~&16x)b;$%H6cD8`(Eu5zPpB z86+mQ>MVX%eR**?Yg3aqFnd?hB2q-T(Tf$RbRMRiQF<-F(?W7LiL6U!nq#bD2!Oq(1`#yLOBHN$wt_G|Dqp=M~W2&c3HGw<#;^${b4vHE+ zoMEAbs7k<@6a%d8XuvQH!k!8Fn9|oy11;yuH)mk%^bbS59Y^E+8@DD9@(^Q?3uZp4 zX}B;n^9C0!PB5Ud&cDChinWzLPl{X_6=ppg7hWM!#KQ#=nICdm)s0l|`Apjxh0S%8 z)=C^Za;?mnsSL^|%Vi=t82&?4%JrX;$+##ebgI;2omaS(;8avkm(A+FP%Iw^WsJI& zASS8{jVf@9b;$G2*luQ|+)L-$C7vOs=ZHS*AjacTQO}rSCnyz{4^)zm8d9 z-1g;{@|k=tJr7I4;;kl{Y_YXIM|VXQu%(~n+I@|s=-G%tLZ*O5A>z~a5*&|8_fLe} zKVeR*l}QhQJ$wx?&vu%`v$++vcC-4_3wx~?4)<6155av*rUN4UNV%g@zY5pPXNeV=9gK^(xDZd!+P!e5N7|>njwR$Y=Wc zEw-oF=7Wq=>xi<3<3K)&n+tZg@Sznr2=~46qbvhI6cnjcle=)tm*D(;z{*GzQJ$j4 z=7@X?#eCu$36|EhI_1!h$ccrF?mOc|o+dQi6}QsKtds+{J4_Cce|c3!^*0_)QgrpdrRnkNOb%dJSUytHDg z^p7dOaxvo0MJ_MyD^1#%xOSIcq()pUhi$bis2MJ~qLK zgkg|DsHyd;1r#L8y;$`gw{Y124oIn;t)vG;GGm&VQIieI++~XQ2|VjVno>uoi%F0KIWnb}Xh__xZ2RB+D)Q+h8sLJa zp-w5hK~UpVDsKi)S^}Ty+5g!6c~)Z7r2)IMfgNaIxeU)Hfy`Fgz(i(ebY&7SDCvf! 
z0QHj$AdFYf;s!B2R=rI$CIqrnW@-BUX5F8vPvY@bg;kH^KkWapKC52F%}qiW$N9e8 zTq**!nxn(G$+>xvWyk`=uaUvo8(b!cyiUfc5o7ujF8-GH1VI0Ih`XG5(qGP5lovnP ze;4&AtId~T^J^0o&qN-`XonxDHJT>)m0@)%z^!@Lva$}zEOsCm3MZ_o6Eu;VU#EQ;StUxRosYfGi;=2Ejzc%_v>%MW!X!K-a! zqV%S&BN7b$&N>rOgho#ot(d6{W6fP>^c`QHcBVf?Y+-XG)Tp6OzG25#IpFR|a#WVd zD|=JedS8A_RNIrt`U`}o;j+XxC?48{y@L=g-hMgAi?6sentcznuvw&GvU@8uo&cQw zZy>=h25(F2Qcn#2e1wbd@Jso3F@HDHA@F9v16pgL;Rz zLD+}%t3?yzzqZC8=t*s+J_A=Jfvj{~JGU~*!6xz3SgsZ#k9Q@3JMmDQECsd?iU_PJ zxU>CpHD-O}^M{20UwQ8t)@0MQjmCls2;72*QbnamReG<|d+#7Z5D2}58iJyrNUzeR z_fA45BGP*ap@$9u0#ZT`?07%#v-h#TAK(7|@BB%QD|5}PX=~24X4W}e{s7Jbxv;d; zMLxo=rB4(desN%+#d) zDnt2jS*3q0`3^B&>a&AM?*3jh&cPI~R2PdbeE#7&fIwXN<*z>)ePkLGp__cr@Na+q zL6)gR`0<_N1VT)(hI0svGyiuUGcrt}5FGc*lVF(Vn<6$n~S01BOmT^aEpDeHhENw|=fW z$Qo;U@&Gw53M;}7WOKhFSS4X|NR!B<;V!0=WyE`EzJC+y4)mK5-16%=_fc_6^p!8O z<*6q-WxQOW#$7r$1rU-1c~+*#r`)=BHo^e$aUJelOxm;VIz(~l=o5+i0& z7m<((3dZd}Pb6HpgD2X;7BNjh;8LOt$`GChx$~HIc9e+&$IwHR8c@^0rrAg^To9&!mPEW>^6Cjt7l3qW?1sKYS^4Ai4@9Z~xTH z(3RY($TWN53GNo-1xBtVNsyk*^v-0K)c(ScFDPOkiTS&eHiPZtp6-XlkLS0ai;$Z8 zZtt@^{=WZc?wu?j!Lyq*_jZ&`$mHSm(cq)!sxPCzssFt|CP#@+A1hFl_3_PA%d>q-V#!Xe?PD^<&W<(Evc(uLE{g1r;0l5+kvw#_&7F6Q^wzaEX8#Y3)Uvb zXW>=dy6n-!kvpQNM!9FoTvMObL6o#J49YN*5s}f*c&Fae(+^fiZrEe}3otoIc9Tcz zfkzW9-+by_%OyT?$YM!{^r40v3os2h~2a@~N zn@O(>HV+^49kzTM0s+=zh?SO?pHime!}$Ae)&jTPI(u*} zI0}53yR9m+aU6%*CV}ucepOQpwF_3W*+`n=YbOu}Fcb?Gx?d>$#~-!?JFC+k2$O8P zT&@{5`Vn=r(kCU~bmXQ2QY0$fhKoW&Lz<*;{DZs_gYWf;*nt!kavamu)bHkapt%FS z3$D|EPd&e>*`7^Q_AU%r1gTkD} zeV~?gOyab#(+fdX&x9k0Vu`55FT*{!+R<8Vs4!YX@iAD<0#(ZEh(}eSX*Elx>NAo# z4aUx+Syv%@+b@LeRVRZOb?Odb_w_q0(skgNIz#RpXb>u~XjI!OB;de=%odSzxdQVf zNY5~wnNJ{NuoYS!czKY0E~p!FMZgCHiot)mD&AdW_jsp%oN{8Hij~9X#Xo{48Qiul zA1f=u+=~c+Pdb!l{mn}@?B(0NMi`(MM%R0a;UBV@@$S~=ai2n)CVqH40dJMbW@jf$T z5tJLQ&1FIQi=@@^q5WEsq0p-TqPy9M5VJ{A*YT&K538nH#-u zp}QiJZI<8CB0tJ7T?`j5A4CDY(Ata|AasEgA3uk5m*uy=x2OI+oysP7uZ)81q~pxh zx}$Y`G#hh6Fo!cvFYp-_+&I!?$!t0~79#XF#;w;vxc7BC!deG%@n8;yBM{&rOOj|L 
zw=v!I>sw3qZIAv*&Vvhjc@~7++u_(@Ycs1Zbyc}+KcK$e4K2-*Fu>hxX&?0`7UEej zDyWa+H{C>>EM8-1rH;T1utB(bs7^$kZ=7x zV3dQ8ot9Z*vbfo<9rs+|6=n0L;&UNF1&V7m=RfIeTzw^Na#czenl|1ZYNtU625X43 z_ig&Ct@0qC4~2}fia|;56hcDp{+U01V*;nI>Cb>hukpI_kpKA6knk|!kv-aCklkcR zEa`J3k3w6?)?O^~-V;!LV#_Vtirh zRAUVLIg(A}oP+P18DX&2a?Y;g<{lf8QZX!@%SKs!^malR!0O^Nhd)N6@;lZ^=hi=j z9#%a=5PN8Uazg9SA2(|A_J$eu5{sC>?QDvSoZ!AiTc{wt_I0GL?oEvOY+P`B8Kx*B zF2YM>q|5Xc?$=}jW8YZeedzHyZ9{tY32)7hJxRDaF8D5ihbBxb^=qMKMhGwCxul~V z>;zW+>itP%!%1T8{$koj-?2$@`ll3YJ-_j~JvQ0^;s;fOI%6;F2?+6ks`=v1HyD9o zoDluR8oNT`bhI+UXn37y*r}-8J{CVG(u$K6d63e0bhnJK6VDcE5Ixn!?efVe37O;^ zOFvk*;dtWI?iq0B9Z*(p3far0AnvQHd-GUaEr(GNFJ)XylE0 zc&Fg-aSZmVLsJp0X4ZE+U}V%UW^AIhOB3NzNYG4g_MZ;@^0yOMwYpE>BmP&L#(iwI zPjW6VHXZdYF27q7MzYg0L@J@sL^xuNk8F#Z){u55QL8g{w=1`$msI>HfZmWzs{bcm z2y{|PR46zWUNvm=WsVYjLTx2`yR+T#=?(94V!rPW#feB+urIjUDy5;*r9M&FdERBW zT%AJxhFp1^J)c+oPD_8*D5`VQHQNCtMu;eM*4xx<{a3(Y&6zWue?!;3x`BjSUTiEZi42e_lcTMnNFGB2|B(=o4&;qRA~FK=SfP&>eYMR z(TpMEhHmsEP|Nd~Cj)2w?*-zw+o^aZu%eJ`zQyXjW%fT-``9Edf2={)Bs5Y3=+ z^Yjy%A|9Je-Al@4r*z&!yxhP$| zG-i`0`x@cGSWp7(A!~i-rwL<|*BdPy(u#I0Cb^npnk97Gwh;Mp36XO_;YlhwW1RVP z^e-5EJcgD|ABjXiVgEcoqXPe0;zvBYah_u`wpZrj!oJXn{#&E6^!tr=%NQ5-t*<3~ zXR>RX4dj5}$$!2k1@%9_9)$NglNR2RA$edp{zgMSmkd&!D33iB5K=~wvkC~=^$oR- zR)Otw#@9>Ur@3Qma})cduzai6I6p(F3Qjj03Da_h)Y;xly0j-GN~x z@$W4fZaWI8?=VeJ=0a`Fey%!MK|@1jCaUX#Mp(8c2VLq_i$)H}MO;3pF)(5^G~P(H zIdi(Vg{gZuU?BRg8o0TjT{;EiA}POaF#i~J0}!JMXQd@?HiL@n!#|JDOWb%$eXrC( zF0IFL(=jXd31%i`sE}|a%q?ApLs}N;W}TtEX5KMK*XpQ-xX0Q`s?#ho?aUinXEy0g z&%C!Qxe)Lpd!sbz;TveE0xx@AA&@S6T= zWn-iN2vt#c1G#6K<5_uJK>-LpdOIwR?Ash|$cK|W{Y((!neg*7T(GCP0w3Er>4z*) zm*n&r&t&WgwmjPPnc4S7`5?q0$MmVnU18Rer^BUzE;Fd7pQn41HVtH33z{|7QXMkX zm2uFIise%b_m*5~~BnoQK};^=cO!Eygau>-VsSJhT&YT)4DJTlmhQ zm^b$81=cmYYb!ODne_I+ zZl}LK8#=HBrqEG};=e&wi?{eHm~H@+IfJ7CW zbor%6(@+fnGLkA?D9jtHhjJ*`*!r`9i zHcP8DZJ>J7nt2n*ygPR-Sw&K1E{4uEYka+r3N4P2z`la=vsg!43maqjq+s9V3!1h0 z7IA;Uc_%84( zm1HV9lm2IBSZZ+k7>LdQYkyVv-qCUJA}QrF6V&?&G0mf>@xrHPuH{`@|BTZc8wr{u 
z;xd6oJ|O@g92hWJ|P807hp};ogreR zrj1qTbjS2GZY*EE{9zO48F*;MZWw=g7W!@V8tiNB?kcYwwT&N@MaQY0?<9}4;rMn= zO&jyjQW)E$@$?na?L3fO67dh0Hy>?ju3E3EnGZtxMm?_kmJUKRa-F3C0Fm445L8>% zc6r-fruJ8WJW4|CAHqS({=ivge2%oYi;KP zC{JlaayL^0gA0Mp*08C8cLw92LFv4WCe|}hM+J%m*fH6D$}-DAKXGT4)kbB>bZQ=s;zP~*s?y*M(M)1wL!N4Q-`Sr zuk+}BPh-wdFVwtJwW6z$A565m0r<&0x)<}DdhGkITj+mOXpV2NkII52Yuzz_HqLka z%;@GW&vw4mCS>~gXVw9sz7#yTPq1SlewZ4Y_-)u6w3llLnNr&VdFGW8=MZ6O2c27w zc;lx?zc@#0pKqO3y9lfwiEUawM&~;pT9wYQPN{lC+S3J+q zQSM^?aV&Y<^VY!?Z*KUfliqGhNDD0#w)vQ8MmxXiprZo=kAo+YbJ{J|lF;>Y31JOdE}QFGpb@QQvq#$Htb*rU z%m%Lkx~852axr|cMc7@jf?z9{J#fNWp7PwDL3>IAVMM}jxr{F~dWXJRCUB%w9tHlw z`332D$8XD*WI)GqYYJJ<#n_=M=sfg$v1aBvgA7s+;B3VyLe#!lw^3RgKAd#f;w0?l zn{)-t4aARm2z#xD-rr`EhF~~^Zv)D|^=;`?D_LF`B${9SX+4;TuAol2Xmaw zhksb^7$iE;o1(pwhr$VkuKFS3W+2?+lYk$<7EcY_L&kt@pDpjv7`_DXv%PbCX6>4ks;?kVgQ?q74QclS& zO{*`&oe{lpW}x`Jg9>8m=9#Xlog&byVRkMip&hWzii7ur>twT7iyrO3tXYojN?;b5 zchX5mRzr4?oWsU>Y6%5fmQ_&Cg^*Zv$yT%bGR-PtH~P9aP0#cF3o@up+iUIJ%l%Fl z_C}@!ys+QS2f|OwiWdk~k09A7uJhleEsQ-^GybCaEBA#T?p<~B#E%LKmXGX~7sr!| zUC)tq1~_0N$6Ae!NJ;D3q_Df7<8PrFFrNk~K$e|gz;=`z&^u%<@F8a@OUHSy4KJCG zJYb}$sEm;X7CYHC9XETamB`3Xc7Chqb=$R0F;noBfA@uV3RmSbl$UgFAUCgph5H!vs}Z84kX3WwW(#0!ngw6DWe=V)0n((+-)C{^XAI0w3nmw%+Ay>ZcjJVe8KAK zc#M|NG;b4;n8V7L8}j~5%)>0LL3Rrf^D+_JY(cUW2plP)eGT@UV$*m7hR8ZEmJPBJ zQC6oTWC{@MP`d~2GzmEed*a3ioqlD>hw~U#5kRdb9~$cI{Ti0yXpRpwd3`+yjc@Br zub)YBA9D0}INT^}Ym@DQxL07tPlzEQ;x)6`9q0YSMP)aYA8?c^DP)^pu8H}1bXBg! 
zh6S-grLW(KPHQ-D7m{IsV7u{ozrA2}jAet8*s5=~3EH0|u7zUP#Xg3i;9LI5;PBD< zG+T!_o1o!__7T0&^-Z-Dl}To+IK;lAwt?g?jkLmDQa@(V(Y$W8wX*&wA9)=bs|`U< zLxv8G;a2?IkCoMlcEUWty9^Fi@NzsPVGt-PiN6Dm=MVEoZX5Hq32VAQx%{Y>lkOM@ z9;a#W9(eUzYzPp{ok6`j&22@qSt4$pc6CvROV5_=Z%W!kZUPvZs(tFQE<2YK#V zA_<^a7KpQVEQ&XEzOzG;Q0^!A799%Rjj@_*)#mF+@i?iqs$2PG#~^iyc~=awVo%(| z#!1eC@+)3*HJ`i>SNt^-s(3h=r}0%`i;gbe)2_fa?WduI6`S{Z<0tj2Iz(=2;!C>Y zop>=%L>y^!a{VKx1oAUc-Ix>>rky$FjDXeNMbC@AUipBYh`lRh=)4+_l*)pM@kE8^^|E&XF7?W-^o3eTkkm%X`hqC9ElGW~7(tzX@H#Dx|GuZ?rwLt1+m4S9Yl&ae05<~?ufsb&HLp=6Cu@ZxoD38(X9qVC;C%SEDH0Z z;}Q5ax!o+tnQSFIZNSEdH?EdV6rHAM*;xsbMv%NC=(ry` zZ11;ui8c~e27?RkxBN(^3;8kC=3Uuxb->YB;}nd4ciWjM5qUH=$yo}$?!VKB8QT~P zaO=6Ff#+O7s!+I|rd#9JHh3dn*#Lgv9vDVv>whvp)`9|x&-2ba8Ap%*Sc|k)Nsp6W z@VDm48@KN3N-A{z*2I5#k9c^vys>mY%TonQT^j{fB<6ps^&1`a%Dfxq308fGUN&x@>=2zt>R>L2Xw_ST6GMN3IFHcx~Z?0{A8}&_!HJ@_WuaC4YgF5D2-_H$5>P7nGnT@ zF&`>@LCAIYOlH7z%}|S>rdq=l*!-<{Ntb(=AD*%mPO}Evp(B&k|I>#aP_R=L_nF_c z%RPV$keVP7;2)8{jZ0|s#ed5M06xh7Z7=}fHo5ixBrqpCeLT|!LTrO9GDtjh;ZF2) zXMQHSwe<^R+2uCS0-EFZ{wix%14AV4f^Pv|WdHmAU@)PR{$>ai9Gu{RDz*t9GEB#L z=FGjaPB%ASW&J^B!mR~)9~C&lf+okcd8N0a?lbmEVpPu-xLL-;$2ApW z1oitaMN3y(ws~S)z`nIDm$_LzmxA89ZbKrC(1EXGn#aGfvnqDzV{7ysy$;5+i}6a; zT{5FS8S!V+AXp5d!}%AZ-%6~*!9<@}tR`C@tiA!Sk1vsUg{?f0g|ywjIIdqiABJm5 zkpo7=1j?RSl#yHyOICgo2JK~jUb=ub2(%1GDa{@P`JQW`eegpiqDv~gP^W;C)-&VN ztNaUR>5oo&r^uA(u3_|I&l zi+&A?0~$6P?2YsVOt8oy?&QVcHcn`@IU z&oMX+W;X~HZ8wo%h^sTqBTt z?hNv;eGoqWy*HcgWQ~IB2o3sr3d1dNieVPiG#6S-z#yo6e%W*?ZzK5T3_ZR}n%_)Qq%GNL+Kg@f9GA?^wD+gL zz@RTlVvk<5j=NQQTm|lgI~R7IdB$*<*V~5w1et}uXfebiI^d~&=p~uba$lV`>H7FN z|L#{lWWfsujXY+yYPrX*Ng8PyA{t_Z?lx`sshc`|bdlBboCzG~Vun)f)`?$T*E?;6 z3%yrpoRfey-`_D|<4j)bAEba7+*x-@I^z!_9Bq5lmt=6}ft5t1y6;}yD^m5tent0R z<|mF_lv#V=HCV<1_DK9zvr%u(zm(1Srp`f7L}RVY*61Mgj+qqM-4RB=4dJze0VcyQ z>E3PT;^%67Zjd;XBO_r{gi;^;`SafPnwm}J136ey&*il5WnAqg>Hsfo?_|N5b_XKQ z*wD~C!AelE_6(npUUdjfNB&xtWX^E71FU)bMZNrG*xXF39li8p^g42vx|V4^xTDRv zIj+_4Onfd>@H?bXBgp2&qQ)H%Y!mfY69WME=!Z=Hmzn?nsT%$_31mzImevq`*KMt1 
zWH4vYq(R?`*7YR8odC!aCMdqeFPk(sXBZ-a9RB2HNpBF$HUN1<70IHWGrA2gB;$;( z?vHe6d}=7>4e@FvN5(pZmGVfV!FSWTX*8uxt`4_z=e7HFbnXBa#|bORV$}t}vuuk5Dn=OsW^7{%Kf*Bkwi+`e2g-4oqB`mC1J&ztG5 z6Lj#c(Y;@j70iIgX~Mbd#PM=_LFbRDo25D`uQMkmc3>;1ToMHDi*!P*8IKA@D;ePb3R*iI}M%% zIs1grU@5Nv^q~|e7n1A!VYOcYCzGLU^B2ax7&E3Rcf={~8~FS(*|I+4pnDrqD0Rsx z9{9BSa2gIN%s?z``9fNTlrVhI)7tmW2u;+L^DMZNTOK95hoa9&hNSzosKOV55JNtb zbJ7ayG~#%fBdO}`T#IG+7f5CwRNZrXJbel%D69pYZo0laV{r*v|Lx+L6?K@Uwp$#<{7K7h{B?c&#KE; zF?@BY^+(vm8qX7+fNRTc@7fbl{lW4RV!VHEetrn>Fv%G%y1leDUK(Z*8b1H%{Bpsx ztxt*&KcE%#8s##~ZRXSCkR?63u%V5SfTgC0Xw3($bVtTWBdXOEE!?x4efl?rq~HH> zaM@gZcVbi)2-(dLCm{c}8Pl`3NU`R5Z-}?_=D}ab6qPmgPiu_6@l(kr?m7S7gh zEpJbMm!TDIDkIty_zsoO@jjN&F))StmEzcl$1B>mVAPHVt()NKyyt8~4_{``4o|J7 z1s2aKJ+r*)ZJ%o9>YRDrO{A9B>^Gz|_Rydsndf<|6oS`6!Z_)2pC%RvV&9$2*Is}? zVa=%QQ9nz&o8V;$!9Zy-&rEZ#g{LVOvxtU(`Un@#`Hk+6L9gADOb{rmz!x=b{yba& zz^T`>vYR4KetwO|h9TQuJ0W>K!@IHJ>%e^Zyt&)V zg(rXb7AK(xg(O|td?w@o2{Nsw_n00$4!Sz4p~>_NO}HL1W6J3MFl1t2Iz4%Tfsgzt zJfshF@XF*gDvu9f^#A3O%WLTiC+j$;r>d9tyez31(h|R(GeVB}2`Di^JAoU0-=Wt* zwpU>_88-kj{=^PuZ&4J04?mcl+v%=jDlL3I(~ZM1ou5>P#m9wM2!K?kpMY!fA`H<- z>Ep(d(c7%`*C^Zpl&M0x#2VHJ#2;mwMXP?9AhD_8s1SNNd8im|XN-I$eT863>f1Uk z7!r=eA6VlB)StET?QzEkJ?%FN-0bq@VKRK?136*s1^H|aHWG9iIrV?hQhM$GXw$#w z4gh%S_Al}S1poXOn*sng!~Uyo|NoI7U3+ zF2TV)djIj?@BafN`oCFj{@*!#`W3pZRXpp`Jc#?ZjHvv(Oo1afBBLL_9L=WQ{!KiR zq>(D1QlOEx9{TZ{-Os;|{x^Melisitc)WRG8++zMpsdhk2zXKySqrr7QRiQIr19is zt#H%+-70%kWyPW5<2wT@N_J=j|kZ-M{-J#bK~gYf?7N~alIdf0^S zl8wHZp@4DnHDO=|YgI0WB0Kr>on(p=8rnr5U8;Vkvv{cQi3(Y*l(es!VrJoXsLbmnkMo0A-I?y4ToB)jqwpSmVHJlzbR49u zEA|~+CWbjyKkVJ;96eHfoD2pnSvNG1-`5#@o@>O^o@7-Sb_o-w_sv!22zWU0+|Sr1 z(x9RY7-D*90KSxaA^;8y91*Cb@zig=G5jz-4uAG0dY|9v`%zbN|CE%A*GC0C#-w*X zh|jnBcA7#31Bck7%)&%z)oFPDNm*KNw9XsI66u!s_8a>Lem_IboPcFy`iX?fc>IAW z15K|T^aTgP)~6`is;M~(L! 
z;|=2l%HD8V=)nd+ES0PVZ+6qP2WJ{&)-(J|6WU(u`bTB3`J~=v1qW{ z)+Z-OLJRK;c6>ExzjyfEK5;O>vT3)qukU&PAy>==KHPKQII_I!tlwAwm@ptTD*!f6 zxI1yJx(=p9t4-hebq&E=YBj`K~a9SzJA zVk+t;@D>ForvK@<7&HYbe>05#wIpr!(_e zQXpeZz<|xA=1IS-`z<_+Zon1AEk6k|el-h$)d%%*jEZZu|y#=%COd|Buc~ODQd*k>Vtx!@CN<4EbnZ>;?BD^p< z_MP}(?h^PwyT0tZj8V0JO#;q)#ilLhOKr8>f=4T1Lo8%q(2jrDxJmQb{vch+hI(>+ zMU!%_Up&^WaF(3WGg^r|-3({G_@?Zb8u6Ba^r}==1v`;uj#H&AVIT8V%#!+ozej$8&te#!WvzjhXOO zsN758pmL_NwNZ9Z*VCm^T#rt04@rZ+uL!cSj9VkGW||CRloM3xsQQk5^_yFc6S7ZT z!QR}ugC79XSIEBTJq%os(4rF?1iQiy%GHA>N1QJNZBGq49~{(OH$)Q8_E&1AYPT)& z)DBi)F|6#o_mgnmi6d=5zF`T9*B2^%{@wbvtumfNf}@^t-gKMrfiKfZO+4pw1?rEf zB*CHG0=Y;#SVx*@Pr(w5+QZs5w!R)*KAv|ao?X(iPVtAigcZf975G*fP(=Zcb`$r^f6ttI1Wc0{G@;pRzhX18~00*HH$=*sa|E%eUWGPp;ew3 z#es_Oz@I41v>^_&#E1yt56nVluurA;8ECTjpT%*bDh}_*?yt)Sj{O)__e4E^4b%}> zd?7jfnvz4^C&t;&RviGoF#sNpOngS=wjm^>is5Vu-1~NSUNMqiF4j5E%S=cyv!i`j z&&A{)z7IR1(9D%QV|hc@T7Il3(&aL;^WZr6-OHMGi5b%$n-WwJFkRYa!r|8)@Z8Op zNM{Brx!lAqkI_55ohh}zbBkrcHKFK}v{*Kwez;Xj!<8{hazmvC<%w0^%SxMHd6W9h zv#O2>hOV6qA7*S(aDgv@vev&{!qPNjfK3c@FF?x0_xWbuj?z)<23meiAg3udeg}`2 zG@Ac}Yoq2Hl?+;H-F*#F1?G{#zS(|Ick<^VxCNg&-{vjj$$Uu&0dP~`T#$GP?Vte8 zIPnEeA(QO+)%XjolD>X_>X#~npfA<O&C!qOzw3RcHa4S`zyDnR~$Wg@eIhX_O|kpie{FvUcp$U59q{(M(Ah-Gkal!oh?2 zB;wu$sd}T!&jA`BZi{I7`*Jh%CcO97jjAN=n6%hy2V0!=W;XhPAIzl9L{a#Enz2rv z-c)0(_O)Q=vAKxOlU{9SR;b~CjmC!XbFKT!$Hr2x;RVGm1}ljOba&DWEOE~U0X2!av)`3gH1eL|-P#KdGXCE@*2h5{Fm>m`oAgGtOH$M^jcjqraM_?CtwDh;?Iu!0XC4&Fd@N(?6n$!2oAE6pev_nwd83S@Dr{C<(BZpj|f z=O3_wB}%Fbeaays{S~UD z=HyFW-PjX{Jv;~zm9@QSBD{50gV~%UccC^mKL6+w zIy*RlbQKw__pXPb+|WP+I%8YUZD{Zp*xx4lui5oM?ESy)YPTH>?&C+ESO0kqVEVtE z(Es&H|M!LeV_V{X^^0q=J2hM-m>!(u6HadtK5krOQWsn(8uJgte|`J}YuoJbGyti+zr0**9sZDV|S^N8@ih|~w3c0r*|1X%i-8KLK literal 0 HcmV?d00001 diff --git a/tools/speech_data_explorer/data_explorer.py b/tools/speech_data_explorer/data_explorer.py index 23ea375fa608..de2b342a1028 100755 --- a/tools/speech_data_explorer/data_explorer.py +++ b/tools/speech_data_explorer/data_explorer.py @@ -59,6 +59,7 @@ 'contains 
': 'contains', } comparison_mode = False + # parse table filter queries def split_filter_part(filter_part): for op in filter_operators: @@ -125,6 +126,7 @@ def parse_args(): # automaticly going in comparison mode, if there is names_compared argument if args.names_compared is not None: comparison_mode = True + logging.error("comparison mod set to true") else: comparison_mode = False @@ -549,6 +551,7 @@ def absolute_audio_filepath(audio_filepath, audio_base_path): name_1, name_2 = args.names_compared print(name_1, name_2) + print('Loading data...') if not comparison_mode: data, wer, cer, wmr, mwa, num_hours, vocabulary, alphabet, metrics_available = load_data( @@ -930,8 +933,7 @@ def update_wordstable(page_current, sort_by, filter_query): wordstable_columns_tool = [{'name': 'Word', 'id': 'word'}, {'name': 'Count', 'id': 'count'}] wordstable_columns_tool.append({'name': 'Accuracy_1, %', 'id': 'accuracy_1'}) wordstable_columns_tool.append({'name': 'Accuracy_2, %', 'id': 'accuracy_2'}) -# wordstable_columns_tool.append({'name': 'Accuracy_' + name_1 + ', %', 'id': 'accuracy_1'}) -# wordstable_columns_tool.append({'name': 'Accuracy_' + name_2 + ', %', 'id': 'accuracy_2'}) + if comparison_mode: model_name_1, model_name_2 = name_1, name_2 @@ -939,6 +941,47 @@ def update_wordstable(page_current, sort_by, filter_query): for i in range(len(vocabulary_1)): vocabulary_1[i].update(vocabulary_2[i]) + def _wer_(grnd, pred): + grnd_words = grnd.split() + pred_words = pred.split() + edit_distance = editdistance.eval(grnd_words, pred_words) + wer = edit_distance / len(grnd_words) + return wer + + def metric(a, b, met=None): + cer = editdistance.distance(a, b) / len(a) + wer = _wer_(a, b) + return round(float(wer) * 100, 2), round(float(cer) * 100, 2) + + def write_metrics(data, Ox, Oy): + da = pd.DataFrame.from_records(data) + gt = da['text'] + tt_1 = da[Ox] + tt_2 = da[Oy] + + wer_tt1_c, cer_tt1_c = [], [] + wer_tt2_c, cer_tt2_c = [], [] + + for j in range(len(gt)): + wer_tt1, 
cer_tt1 = metric(gt[j], tt_1[j]) # first model + wer_tt2, cer_tt2 = metric(gt[j], tt_2[j]) # second model + wer_tt1_c.append(wer_tt1) + cer_tt1_c.append(cer_tt1) + wer_tt2_c.append(wer_tt2) + cer_tt2_c.append(cer_tt2) + + da['wer_' + Ox] = pd.Series(wer_tt1_c, index=da.index) + da['wer_' + Oy] = pd.Series(wer_tt2_c, index=da.index) + da['cer_' + Ox] = pd.Series(cer_tt1_c, index=da.index) + da['cer_' + Oy] = pd.Series(cer_tt2_c, index=da.index) + return da.to_dict('records') + + data_with_metrics = write_metrics(data, model_name_1, model_name_2) + if args.show_statistics is not None: + textdiffstyle = {'border': 'none', 'width': '100%', 'height': '100%'} + else: + textdiffstyle = {'border': 'none', 'width': '1%', 'height': '1%', 'display': 'none'} + def prepare_data(df, name1=model_name_1, name2=model_name_2): res = pd.DataFrame() tmp = df['word'] @@ -1050,6 +1093,33 @@ def read_query(query): return "No filter query" return dcc.Markdown('`filter_query = "{}"`'.format(query)) + ############ + @app.callback( + Output('filter-query-input-2', 'style'), + Output('filter-query-output-2', 'style'), + Input('filter-query-read-write', 'value'), + ) + def query_input_output(val): + input_style = {'width': '100%'} + output_style = {} + input_style.update(display='inline-block') + output_style.update(display='none') + return input_style, output_style + + @app.callback(Output('datatable-advanced-filtering-2', 'filter_query'), Input('filter-query-input-2', 'value')) + def write_query(query): + if query is None: + return '' + return query + + @app.callback(Output('filter-query-output-2', 'children'), Input('datatable-advanced-filtering-2', 'filter_query')) + def read_query(query): + if query is None: + return "No filter query" + return dcc.Markdown('`filter_query = "{}"`'.format(query)) + + ############ + def display_query(query): if query is None: return '' @@ -1068,50 +1138,325 @@ def display_query(query): ] ) - comparison_layout = [ - html.Div( - [dcc.Markdown("model 1:" + ' ' 
+ model_name_1[10:]), dcc.Markdown("model 2:" + ' ' + model_name_2[10:])] - ), - html.Hr(), + +comparison_layout = [ + html.Div( + [ + dcc.Markdown("model 1:" + ' ' + model_name_1[10:]), + dcc.Markdown("model 2:" + ' ' + model_name_2[10:]), + dcc.Dropdown( + ['word level', 'utterance level'], 'word level', placeholder="choose comparison lvl", id='lvl_choose' + ), + ] + ), + html.Hr(), + html.Div( + [ + html.Div( + [ + dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_1, id='xaxis-column'), + dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_2, id='yaxis-column'), + dcc.Dropdown( + for_col_names.select_dtypes(include='number').columns[::], + placeholder='Select what will encode color of points', + id='color-column', + ), + dcc.Dropdown( + for_col_names.select_dtypes(include='number').columns[::], + placeholder='Select what will encode size of points', + id='size-column', + ), + dcc.Dropdown( + ['yes', 'no'], + placeholder='if you want to enable dot spacing', + id='dot_spacing', + style={'width': '200%'}, + ), + dcc.Input(id='radius', placeholder='Enter radius of spacing (std is 0.01)'), + html.Hr(), + dcc.Input(id='filter-query-input', placeholder='Enter filter query',), + ], + style={'width': '200%', 'display': 'inline-block', 'float': 'middle'}, + ), + html.Hr(), + html.Div(id='filter-query-output'), + dash_table.DataTable( + id='datatable-advanced-filtering', + columns=wordstable_columns_tool, + data=vocabulary_1, + editable=False, + page_action='native', + page_size=5, + filter_action="native", + ), + html.Hr(), + html.Div(id='datatable-query-structure', style={'whitespace': 'pre'}), + html.Hr(), + dbc.Row(dbc.Col(dcc.Graph(id='voc_graph'),),), + html.Hr(), + ], + id='wrd_lvl', + style={'display': 'block'}, + ), + html.Div( + [ + html.Div( + [ + dcc.Dropdown(['WER', 'CER'], 'WER', placeholder="Choose metric", id="choose_metric"), + dbc.Row(dbc.Col(html.H5('Data'), class_name='text-secondary'), class_name='mt-3'), + 
html.Hr(), + html.Hr(), + dcc.Input(id='filter-query-input-2', placeholder='Enter filter query', style={'width': '100%'}), + html.Div(id='filter-query-output-2'), + dbc.Row( + dbc.Col( + [ + dash_table.DataTable( + id='datatable-advanced-filtering-2', + columns=[ + {'name': k.replace('_', ' '), 'id': k, 'hideable': True} + for k in data_with_metrics[0] + ], + data=data_with_metrics, + editable=False, + page_action='native', + page_size=5, + row_selectable='single', + selected_rows=[0], + page_current=0, + filter_action="native", + style_cell={ + 'overflow': 'hidden', + 'textOverflow': 'ellipsis', + 'maxWidth': 0, + 'textAlign': 'center', + }, + style_header={ + 'color': 'text-primary', + 'text_align': 'center', + 'height': 'auto', + 'whiteSpace': 'normal', + }, + css=[ + { + 'selector': '.dash-spreadsheet-menu', + 'rule': 'position:absolute; bottom: 8px', + }, + {'selector': '.dash-filter--case', 'rule': 'display: none'}, + {'selector': '.column-header--hide', 'rule': 'display: none'}, + ], + ), + dbc.Row(dbc.Col(html.Audio(id='player-1', controls=True),), class_name='mt-3'), + ] + ) + ), + ] + + [ + dbc.Row( + [ + dbc.Col( + html.Div(children=k.replace('_', '-')), + width=2, + class_name='mt-1 bg-light font-monospace text-break small rounded border', + ), + dbc.Col( + html.Div(id='__' + k), + class_name='mt-1 bg-light font-monospace text-break small rounded border', + ), + ] + ) + for k in data_with_metrics[0] + ] + ), + ], + id='unt_lvl', + ), +] + +if args.show_statistics is not None: + comparison_layout += [ html.Div( [ - dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_1, id='xaxis-column'), - dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_2, id='yaxis-column'), - dcc.Dropdown( - for_col_names.select_dtypes(include='number').columns[::], - placeholder='Select what will encode color of points', - id='color-column', - ), - dcc.Dropdown( - for_col_names.select_dtypes(include='number').columns[::], - 
placeholder='Select what will encode size of points', - id='size-column', - ), - dcc.Dropdown(['yes', 'no'], placeholder='if you want to enable dot spacing', id='dot_spacing'), - dcc.Input(id='radius', placeholder='Enter radius of spacing (std is 0.01)'), - html.Hr(), - dcc.Input(id='filter-query-input', placeholder='Enter filter query'), + dbc.Row( + [ + dbc.Col( + html.Div(children='text diff'), + width=2, + class_name='mt-1 bg-light font-monospace text-break small rounded border', + ), + dbc.Col( + html.Iframe( + id='__diff', + sandbox='', + srcDoc='', + style=textdiffstyle, + className='bg-light font-monospace text-break small', + ), + class_name='mt-1 bg-light font-monospace text-break small rounded border', + ), + ], + id="text_diff_div", + ) ], - style={'width': '50%', 'display': 'inline-block', 'float': 'middle'}, - ), - html.Hr(), - html.Div(id='filter-query-output'), - dash_table.DataTable( - id='datatable-advanced-filtering', - columns=wordstable_columns_tool, - data=vocabulary_1, - editable=False, - page_action='native', - page_size=5, - filter_action="native", + id='mid_thing', + style={'display': 'block'}, ), - html.Hr(), - html.Div(id='datatable-query-structure', style={'whitespace': 'pre'}), - html.Hr(), - dbc.Row(dbc.Col(dcc.Graph(id='voc_graph'),),), - html.Hr(), ] + @app.callback( + [ + Output(component_id='wrd_lvl', component_property='style'), + Output(component_id='unt_lvl', component_property='style'), + Output(component_id='mid_thing', component_property='style'), + Output(component_id='down_thing', component_property='style'), + Input(component_id='lvl_choose', component_property='value'), + ] + ) + def show_hide_element(visibility_state): + if visibility_state == 'word level': + return ( + {'width': '50%', 'display': 'inline-block', 'float': 'middle'}, + {'width': '50%', 'display': 'none', 'float': 'middle'}, + {'display': 'none'}, + {'display': 'none'}, + ) + else: + return ( + {'width': '100%', 'display': 'none', 'float': 'middle'}, + 
{'width': '100%', 'display': 'inline-block', 'float': 'middle'}, + {'display': 'block'}, + {'display': 'block'}, + ) + + +comparison_layout += [ + html.Div( + [ + html.Div( + [ + dbc.Row(dbc.Col(dcc.Graph(id='utt_graph'),),), + html.Hr(), + dcc.Input(id='clicked_aidopath', style={'width': '100%'}), + html.Hr(), + dcc.Input(id='my-output-1', style={'display': 'none'}), # we do need this + ] + ), + html.Div([dbc.Row(dbc.Col(dcc.Graph(id='signal-graph-1')), class_name='mt-3'),]), + ], + id='down_thing', + style={'display': 'block'}, + ) +] + + +if args.show_statistics is None: + + @app.callback( + [ + Output(component_id='wrd_lvl', component_property='style'), + Output(component_id='unt_lvl', component_property='style'), + Output(component_id='down_thing', component_property='style'), + Input(component_id='lvl_choose', component_property='value'), + ] + ) + def show_hide_element(visibility_state): + if args.show_statistics is not None: + a = {'border': 'none', 'width': '100%', 'height': '100%', 'display': 'block'} + else: + a = {'border': 'none', 'width': '100%', 'height': '100%', 'display': 'none'} + if visibility_state == 'word level': + return ( + {'width': '50%', 'display': 'inline-block', 'float': 'middle'}, + {'width': '50%', 'display': 'none', 'float': 'middle'}, + {'display': 'none'}, + ) + else: + return ( + {'width': '100%', 'display': 'none', 'float': 'middle'}, + {'width': '100%', 'display': 'inline-block', 'float': 'middle'}, + {'display': 'block'}, + ) + + +store = [] + + +@app.callback( + [Output('datatable-advanced-filtering-2', 'page_current'), Output('my-output-1', 'value')], + [Input('utt_graph', 'clickData'),], +) +def real_select_click(hoverData): + if hoverData is not None: + path = str(hoverData['points'][0]['customdata'][-1]) + for t in range(len(data_with_metrics)): + if data_with_metrics[t]['audio_filepath'] == path: + ind = t + s = t #% 5 + sel = s + pg = math.ceil(ind // 5) + return pg, sel + else: + return 0, 0 + + +@app.callback( + 
[Output('datatable-advanced-filtering-2', 'selected_rows')], [Input('my-output-1', 'value')], +) +def real_select_click(num): + s = num + return [[s]] + + +CALCULATED_METRIC = [False, False] + + +@app.callback( + [ + Output('utt_graph', 'figure'), + Output('clicked_aidopath', 'value'), + Input('choose_metric', 'value'), + Input('utt_graph', 'clickData'), + Input('datatable-advanced-filtering-2', 'derived_virtual_data'), + ], +) +def draw_table_with_metrics(met, hoverData, data_virt): + Ox = name_1 + Oy = name_2 + if met == "WER": + cerower = 'wer_' + else: + cerower = 'cer_' + da = pd.DataFrame.from_records(data_virt) + + c = da + fig = px.scatter( + c, + x=cerower + Ox, + y=cerower + Oy, + width=1000, + height=900, + color='num_words', + hover_data={ + 'text': True, + Ox: True, + Oy: True, + 'wer_' + Ox: True, + 'wer_' + Oy: True, + 'cer_' + Ox: True, + 'cer_' + Oy: True, + 'audio_filepath': True, + }, + ) #'numwords': True, + fig.add_shape(type="line", x0=0, y0=0, x1=100, y1=100, line=dict(color="Red", width=1, dash="dot",)) + fig.update_layout(clickmode='event+select') + fig.update_traces(marker_size=10) + path = None + + if hoverData is not None: + path = str(hoverData['points'][0]['customdata'][-1]) + + return fig, path + @app.callback( [Output('datatable', 'data'), Output('datatable', 'page_count')], @@ -1219,6 +1564,18 @@ def show_item(idx, data): return [data[idx[0]][k] for k in data[0]] +if comparison_mode: + + @app.callback( + [Output('__' + k, 'children') for k in data_with_metrics[0]], + [Input('datatable-advanced-filtering-2', 'selected_rows'), Input('datatable-advanced-filtering-2', 'data')], + ) + def show_item(idx, data): + if len(idx) == 0: + raise PreventUpdate + return [data[idx[0]][k] for k in data_with_metrics[0]] + + @app.callback(Output('_diff', 'srcDoc'), [Input('datatable', 'selected_rows'), Input('datatable', 'data'),]) def show_diff( idx, data, @@ -1245,6 +1602,35 @@ def show_diff( return diff_html +@app.callback( + Output('__diff', 
'srcDoc'), + [Input('datatable-advanced-filtering-2', 'selected_rows'), Input('datatable-advanced-filtering-2', 'data'),], +) +def show_diff( + idx, data, +): + if len(idx) == 0: + raise PreventUpdate + orig_words = data[idx[0]]['text'] + orig_words = '\n'.join(orig_words.split()) + '\n' + + pred_words = data[idx[0]][fld_nm] + pred_words = '\n'.join(pred_words.split()) + '\n' + + diff = diff_match_patch.diff_match_patch() + diff.Diff_Timeout = 0 + orig_enc, pred_enc, enc = diff.diff_linesToChars(orig_words, pred_words) + diffs = diff.diff_main(orig_enc, pred_enc, False) + diff.diff_charsToLines(diffs, enc) + diffs_post = [] + for d in diffs: + diffs_post.append((d[0], d[1].replace('\n', ' '))) + + diff_html = diff.diff_prettyHtml(diffs_post) + + return diff_html + + @app.callback(Output('signal-graph', 'figure'), [Input('datatable', 'selected_rows'), Input('datatable', 'data')]) def plot_signal(idx, data): if len(idx) == 0: @@ -1298,6 +1684,62 @@ def plot_signal(idx, data): return figs +@app.callback( + Output('signal-graph-1', 'figure'), + [Input('datatable-advanced-filtering-2', 'selected_rows'), Input('datatable-advanced-filtering-2', 'data')], +) +def plot_signal(idx, data): + if len(idx) == 0: + raise PreventUpdate + figs = make_subplots(rows=2, cols=1, subplot_titles=('Waveform', 'Spectrogram')) + try: + filename = absolute_audio_filepath(data[idx[0]]['audio_filepath'], args.audio_base_path) + audio, fs = librosa.load(path=filename, sr=None) + if 'offset' in data[idx[0]]: + audio = audio[ + int(data[idx[0]]['offset'] * fs) : int((data[idx[0]]['offset'] + data[idx[0]]['duration']) * fs) + ] + time_stride = 0.01 + hop_length = int(fs * time_stride) + n_fft = 512 + # linear scale spectrogram + s = librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length) + s_db = librosa.power_to_db(S=np.abs(s) ** 2, ref=np.max, top_db=100) + figs.add_trace( + go.Scatter( + x=np.arange(audio.shape[0]) / fs, + y=audio, + line={'color': 'green'}, + name='Waveform', + 
hovertemplate='Time: %{x:.2f} s
Amplitude: %{y:.2f}
', + ), + row=1, + col=1, + ) + figs.add_trace( + go.Heatmap( + z=s_db, + colorscale=[[0, 'rgb(30,62,62)'], [0.5, 'rgb(30,128,128)'], [1, 'rgb(30,255,30)'],], + colorbar=dict(yanchor='middle', lenmode='fraction', y=0.2, len=0.5, ticksuffix=' dB'), + dx=time_stride, + dy=fs / n_fft / 1000, + name='Spectrogram', + hovertemplate='Time: %{x:.2f} s
Frequency: %{y:.2f} kHz
Magnitude: %{z:.2f} dB', + ), + row=2, + col=1, + ) + figs.update_layout({'margin': dict(l=0, r=0, t=20, b=0, pad=0), 'height': 500}) + figs.update_xaxes(title_text='Time, s', row=1, col=1) + figs.update_yaxes(title_text='Amplitude', row=1, col=1) + figs.update_xaxes(title_text='Time, s', row=2, col=1) + figs.update_yaxes(title_text='Frequency, kHz', row=2, col=1) + except Exception as ex: + app.logger.error(f'ERROR in plot signal: {ex}') + + return figs + + @app.callback(Output('player', 'src'), [Input('datatable', 'selected_rows'), Input('datatable', 'data')]) def update_player(idx, data): if len(idx) == 0: @@ -1320,5 +1762,30 @@ def update_player(idx, data): return '' +@app.callback( + Output('player-1', 'src'), + [Input('datatable-advanced-filtering-2', 'selected_rows'), Input('datatable-advanced-filtering-2', 'data')], +) +def update_player(idx, data): + if len(idx) == 0: + raise PreventUpdate + try: + filename = absolute_audio_filepath(data[idx[0]]['audio_filepath'], args.audio_base_path) + signal, sr = librosa.load(path=filename, sr=None) + if 'offset' in data[idx[0]]: + signal = signal[ + int(data[idx[0]]['offset'] * sr) : int((data[idx[0]]['offset'] + data[idx[0]]['duration']) * sr) + ] + with io.BytesIO() as buf: + # convert to PCM .wav + sf.write(buf, signal, sr, format='WAV') + buf.seek(0) + encoded = base64.b64encode(buf.read()) + return 'data:audio/wav;base64,{}'.format(encoded.decode()) + except Exception as ex: + app.logger.error(f'ERROR in audio player: {ex}') + return '' + + if __name__ == '__main__': app.run_server(host='0.0.0.0', port=args.port, debug=args.debug) From 5c3ed943d8b23ae32719f8fd0e51fca5801596d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 3 Jun 2023 22:52:03 -0700 Subject: [PATCH 013/123] Debug Transformer Engine FP8 support with Megatron-core infrastructure (#6791) * Construct FP8 amax reduction group Signed-off-by: Tim Moon * Update Megatron-core version 
in CI Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: Tim Moon Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- Jenkinsfile | 9 +++++++++ nemo/collections/nlp/parts/nlp_overrides.py | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 79c696a48600..bc991d195710 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -57,6 +57,15 @@ pipeline { } } + stage('Megatron Core installation') { + steps { + sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ + cd Megatron-LM && \ + git checkout cd2537d444792b487b1ab5a6fa685e09c9957409 && \ + pip install -e .' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 5a0f028ddbe9..c390ba995843 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -168,6 +168,7 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + use_fp8=app_state.use_fp8, ) # assert that fake tp and pp rank match after model parallel init @@ -406,7 +407,7 @@ class PEFTSaveRestoreConnector(NLPSaveRestoreConnector): peft_model_nemo_path: Used to provide the .nemo file corresponding to a PEFT model (which will only contain a small set of params) peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFT model. This is required when no .nemo is available (yet) such as during resumed training. peft_model_ckpt_name: The filename of the ckpt file inside the peft_model_ckpt_path folder - If both are provided the peft_model_ckpt_path takes precedence. 
+ If both are provided the peft_model_ckpt_path takes precedence. If neither are provided, PEFT params are initialized at random (not loaded from any external source). """ From ae004147b50a9d963e4cbe5da6ba75407b0aef5f Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Sun, 4 Jun 2023 21:17:19 -0700 Subject: [PATCH 014/123] Lora/PEFT training script CI test (#6664) * new lora test Signed-off-by: arendu * updates Signed-off-by: arendu * check for chat Signed-off-by: arendu * update Signed-off-by: arendu * update Signed-off-by: arendu * small train set Signed-off-by: arendu * update Signed-off-by: arendu * precision change Signed-off-by: arendu * fixed typo in paths Signed-off-by: arendu * full data with limit val batches Signed-off-by: arendu * tp2 instead of pp2 Signed-off-by: arendu * tp2 instead of pp2 Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala --- Jenkinsfile | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index bc991d195710..83223d5c8669 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3381,6 +3381,41 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" } } + stage('L2: Megatron GPT PEFT Lora TP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results" + sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo 
\ + model.peft.peft_scheme='lora' \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel]" + sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results" + } + } stage('L2: Megatron GPT Eval') { when { anyOf { From 76fc488a6584077c85f9a06d81bfd9f73a65e73a Mon Sep 17 00:00:00 2001 From: bene-ges Date: Mon, 5 Jun 2023 08:29:09 +0300 Subject: [PATCH 015/123] change branch to main, small fix (#6803) Signed-off-by: Alexandra Antonova --- .../nlp/SpellMapper_English_ASR_Customization.ipynb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb index 189ac958d377..e11025aeb1d3 100644 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb @@ -83,8 +83,8 @@ "source": [ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"bene-ges\"\n", - "BRANCH = \"spellchecking_asr_customization_double_bert\"\n", + "GITHUB_ACCOUNT = \"NVIDIA\"\n", + "BRANCH = \"main\"\n", "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", "\n", "# Download local version of NeMo scripts. 
If you are running locally and want to use your own local NeMo code,\n", @@ -974,7 +974,8 @@ "metadata": { "id": "ZwEpAOCaRH7s" }, - "outputs": [] + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1387,9 +1388,9 @@ "accelerator": "GPU", "colab": { "toc_visible": true, - "provenance": [] + "provenance": [], + "gpuType": "T4" }, - "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "name": "python3" From aa1986f05960ccd7260f262185b0353811105b3c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 10:19:28 -0700 Subject: [PATCH 016/123] add call to p2p overlap (#6779) (#6786) * add call to p2p overlap * update Jenkins for test --------- Signed-off-by: Abhinav Khattar Signed-off-by: Eric Harper Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper --- Jenkinsfile | 2 +- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 83223d5c8669..fdd311ba4a59 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -61,7 +61,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout cd2537d444792b487b1ab5a6fa685e09c9957409 && \ + git checkout e6d7e09845590d0a36bc7f29eb28db974fb8da4e && \ pip install -e .' 
} } diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index fd1382e668cf..96f40b99bdd0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -468,6 +468,8 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): no_sync_func=no_sync_func, grad_sync_func=grad_sync_func, param_sync_func=param_sync_func, + overlap_p2p_comm=self.cfg.get('overlap_p2p_comm', False), + batch_p2p_comm=self.cfg.get('batch_p2p_comm', True), ) # only the last stages of the pipeline return losses From aa21e8aa91fe893b5c460200b74c928ed918e28a Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Mon, 5 Jun 2023 11:44:09 -0700 Subject: [PATCH 017/123] fixed decor to show messages only when the wrapped object is called. (#6793) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- nemo/utils/decorators/experimental.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/nemo/utils/decorators/experimental.py b/nemo/utils/decorators/experimental.py index 35b26fb8690d..de62dbaf9ffb 100644 --- a/nemo/utils/decorators/experimental.py +++ b/nemo/utils/decorators/experimental.py @@ -15,19 +15,13 @@ __all__ = ['experimental'] -from nemo.utils import logging - -def experimental(cls): - """ Decorator which indicates that module is experimental. - Use it to mark experimental or research modules. - """ +import wrapt - def wrapped(cls): - logging.warning( - f'Module {cls} is experimental, not ready for production and is not fully supported. Use at your own risk.' - ) +from nemo.utils import logging - return cls - return wrapped(cls=cls) +@wrapt.decorator +def experimental(wrapped, instance, args, kwargs): + logging.warning(f"`{wrapped}` is experimental and not ready for production yet. 
Use at your own risk.") + return wrapped(*args, **kwargs) From f9bb1b0f8272f42cf5f64174ec1e51ebfcc1fe32 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 12:31:14 -0700 Subject: [PATCH 018/123] Bug fix for reset_sequence_parallel_args (#6802) (#6805) Signed-off-by: Markel Sanz Ausin Co-authored-by: Markel Sanz Ausin --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 96f40b99bdd0..9aadb6853190 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1282,7 +1282,7 @@ def _reset_sequence_parallelism_args(self): for module in self.get_gpt_module_list(): for mod in module.modules(): if hasattr(mod, "sequence_parallel"): - mod.sequence_parallel = self.last_sequence_parallel + mod.sequence_parallel = False def _restore_sequence_parallelism_args(self): """ Restores the sequence parallelism parameters using the values saved by From 3063e3251bb4dbbf81278084cce132c3e56b4c52 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Tue, 6 Jun 2023 01:52:45 -0400 Subject: [PATCH 019/123] text_generation_utils memory reduction if no logprob needed (#6773) * repro for gpt eval mp mem issue Signed-off-by: Yang Zhang * add print statements for memory allocation Signed-off-by: Yang Zhang * adjusted hot fix that prevents softmax on the entire output embedding,now memory bottlenecked by attention softmax which needs to be solved with FA or long attention Signed-off-by: Yang Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * using compute_logprob to configure inference Signed-off-by: Yang Zhang * enable compute logprob for peft Signed-off-by: 
Yang Zhang * remove print statements Signed-off-by: Yang Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix ci Signed-off-by: Yang Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added docstrings Signed-off-by: Yang Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add missing config Signed-off-by: Yang Zhang * remove truncate prompt length feature Signed-off-by: Yang Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tensor before all gather needs to be contiguous Signed-off-by: Yang Zhang --------- Signed-off-by: Yang Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: Sandeep Subramanian --- .../tuning/megatron_gpt_peft_eval.py | 22 ++-- .../language_modeling/megatron_gpt_model.py | 2 - .../megatron_gpt_sft_model.py | 4 +- .../megatron_retrieval_model.py | 2 - .../common/text_generation_strategy.py | 1 - .../modules/common/text_generation_utils.py | 110 +++++++++++------- 6 files changed, 83 insertions(+), 58 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py index a5bf1ee552cb..fc427a60d172 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -155,7 +155,7 @@ def main(cfg) -> None: if os.path.isdir(cfg.model.restore_from_path): save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - model = NLPModel.restore_from( + model = MegatronGPTSFTModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=peft_model_cfg, @@ -180,15 +180,17 @@ def 
main(cfg) -> None: for batch in response: batch_sentences = [s for s in batch['sentences']] batch_tokens = [s for s in batch['tokens']] - batch_logprob = [s.tolist() for s in batch['logprob']] - for s, t, l in zip(batch_sentences, batch_tokens, batch_logprob): - if cfg.inference.get("verbose", False): - d = { - 'sentence': s, - 'tokens_with_logprobs': ', '.join([f"{_t} {_l:.4f}" for _t, _l in zip(t, l)]), - } - f.write(json.dumps(d, sort_keys=True, indent=2) + '\n') - else: + if cfg.inference.compute_logprob: + batch_logprob = [s.tolist() for s in batch['logprob']] + for s, t, l in zip(batch_sentences, batch_tokens, batch_logprob): + if cfg.inference.get("verbose", False): + d = { + 'sentence': s, + 'tokens_with_logprobs': ', '.join([f"{_t} {_l:.4f}" for _t, _l in zip(t, l)]), + } + f.write(json.dumps(d, sort_keys=True, indent=2) + '\n') + else: + for s in batch_sentences: d = {'sentence': s} f.write(json.dumps(d) + '\n') print("predictions saved to {}".format(cfg.inference.outfile_path)) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 9aadb6853190..3530ffcfc371 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1111,7 +1111,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] inference_config = inference_config.copy() compute_logprob = inference_config['compute_logprob'] if compute_logprob: - del inference_config['compute_logprob'] inference_config['inputs'] = batch inference_config['tokens_to_generate'] = 1 inference_config['all_probs'] = True @@ -1121,7 +1120,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) return compute_prob_response else: - del inference_config['compute_logprob'] 
inference_config['inputs'] = batch return generate(self, **inference_config) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 1dc335b86609..9507a01d01f0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -35,6 +35,7 @@ LengthParam, SamplingParam, generate, + get_computeprob_response, megatron_gpt_generate, ) from nemo.utils import AppState, logging @@ -539,7 +540,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] inference_config = inference_config.copy() compute_logprob = inference_config['compute_logprob'] if compute_logprob: - del inference_config['compute_logprob'] inference_config['inputs'] = batch inference_config['tokens_to_generate'] = 1 inference_config['all_probs'] = True @@ -549,8 +549,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) return compute_prob_response else: - del inference_config['compute_logprob'] - # for megatron_gpt_eval.py if isinstance(batch, list): inference_config['inputs'] = batch diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index afd8ad54d150..5900513f3547 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -464,7 +464,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] inference_config = inference_config.copy() compute_logprob = inference_config['compute_logprob'] if compute_logprob: - del inference_config['compute_logprob'] inference_config['inputs'] = batch 
inference_config['tokens_to_generate'] = 1 inference_config['all_probs'] = True @@ -474,7 +473,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) return compute_prob_response else: - del inference_config['compute_logprob'] inference_config['inputs'] = batch return generate(self, **inference_config, strategy=self.inference_strategy) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 27ae3b2606d3..310065fc3523 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -53,7 +53,6 @@ def __init__(self, model): def forward_step(self, batch, tensor_shape): fwd_bwd_function = get_forward_backward_func() - output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), data_iterator=iter([batch,]), diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 3a07a807b11a..a56304970bdc 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -97,6 +97,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para inputs=inputs, tokens_to_generate=length_params['max_length'], all_probs=sampling_params['all_probs'], + compute_logprob=sampling_params['compute_logprob'], temperature=sampling_params['temperature'], add_BOS=sampling_params['add_BOS'], top_k=sampling_params['top_k'], @@ -116,6 +117,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para inputs=inputs, tokens_to_generate=length_params['max_length'], all_probs=sampling_params['all_probs'], + compute_logprob=sampling_params['compute_logprob'], 
temperature=sampling_params['temperature'], add_BOS=sampling_params['add_BOS'], top_k=sampling_params['top_k'], @@ -269,6 +271,7 @@ def send_generate_info( context_length_tensor, tokens_to_generate, all_probs, + compute_logprob, temperature, top_k, top_p, @@ -288,6 +291,7 @@ def send_generate_info( context_tokens_tensor.size(1), # seq_len tokens_to_generate, all_probs, + compute_logprob, # whether to compute log probabilities matrix temperature, top_k, top_p, @@ -317,18 +321,19 @@ def receive_generate_info(): """ model_parallel_group = parallel_state.get_model_parallel_group() src = get_model_parallel_src_rank() - input_info_tensor = torch.empty(10, dtype=torch.float32, device=torch.cuda.current_device()) + input_info_tensor = torch.empty(11, dtype=torch.float32, device=torch.cuda.current_device()) torch.distributed.broadcast(input_info_tensor, src, model_parallel_group) batch_size = int(input_info_tensor[0].item()) seq_len = int(input_info_tensor[1].item()) tokens_to_generate = int(input_info_tensor[2].item()) all_probs = bool(input_info_tensor[3].item()) - temperature = float(input_info_tensor[4].item()) - top_k = int(input_info_tensor[5].item()) - top_p = float(input_info_tensor[6].item()) - greedy = bool(input_info_tensor[7].item()) - repetition_penalty = float(input_info_tensor[8].item()) - min_tokens_to_generate = int(input_info_tensor[9].item()) + compute_logprob = bool(input_info_tensor[4].item()) # whether to compute log probabilities matrix + temperature = float(input_info_tensor[5].item()) + top_k = int(input_info_tensor[6].item()) + top_p = float(input_info_tensor[7].item()) + greedy = bool(input_info_tensor[8].item()) + repetition_penalty = float(input_info_tensor[9].item()) + min_tokens_to_generate = int(input_info_tensor[10].item()) context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device()) context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device()) @@ 
-349,6 +354,7 @@ def receive_generate_info(): context_tokens_tensor, tokens_to_generate, all_probs, + compute_logprob, temperature, top_k, top_p, @@ -370,6 +376,7 @@ def synced_generate( top_k=0, top_p=0.0, greedy=False, + compute_logprob=False, repetition_penalty=1.2, min_tokens_to_generate=0, end_strings=[], @@ -394,6 +401,7 @@ def synced_generate( context_length_tensor, tokens_to_generate, all_probs, + compute_logprob=compute_logprob, temperature=temperature, end_strings=end_strings, extra={ @@ -411,7 +419,8 @@ def synced_generate( if parallel_state.is_pipeline_last_stage(): src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_embedding_group() - torch.distributed.broadcast(output_logits, src, group) + if compute_logprob: + torch.distributed.broadcast(output_logits, src, group) if all_probs: src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_embedding_group() @@ -422,15 +431,18 @@ def synced_generate( src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_embedding_group() - precision = model._trainer.precision - if precision in [16, "16"]: - dtype = torch.float16 - elif precision == "bf16": - dtype = torch.bfloat16 - else: - dtype = torch.float32 - output_logits = torch.empty(tokens.size(0), context_length - 1, dtype=dtype, device=torch.device("cuda")) - torch.distributed.broadcast(output_logits, src, group) + if compute_logprob: + precision = model._trainer.precision + if precision in [16, "16"]: + dtype = torch.float16 + elif precision == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + output_logits = torch.empty( + tokens.size(0), context_length - 1, dtype=dtype, device=torch.device("cuda") + ) + torch.distributed.broadcast(output_logits, src, group) if all_probs: src = parallel_state.get_pipeline_model_parallel_last_rank() @@ -457,6 +469,7 @@ def generate( top_k=0, top_p=0.0, greedy=False, + compute_logprob=False, 
repetition_penalty=1.0, min_tokens_to_generate=0, end_strings=['<|endoftext|>'], @@ -504,6 +517,7 @@ def generate( context_length_tensor, tokens_to_generate, all_probs, + compute_logprob, temperature, top_k, top_p, @@ -518,6 +532,7 @@ def generate( context_tokens_tensor, tokens_to_generate, all_probs, + compute_logprob, temperature, top_k, top_p, @@ -535,6 +550,7 @@ def generate( tokens_to_generate, all_probs, temperature, + compute_logprob=compute_logprob, top_k=top_k, top_p=top_p, greedy=greedy, @@ -619,6 +635,7 @@ def sample_sequence_batch( context_lengths, tokens_to_generate, all_probs=False, + compute_logprob=False, type_ids=None, temperature=None, end_strings=['<|endoftext|>'], @@ -673,11 +690,18 @@ def sample_sequence_batch( output = inference_strategy.forward_step(batch, tensor_shape) if parallel_state.is_pipeline_last_stage(): - output = output[0]['logits'] - output = tensor_parallel.gather_from_tensor_model_parallel_region(output) - assert output is not None - logits = output[:, -1].view(batch_size, -1).contiguous() + if compute_logprob: + output = output[0]['logits'] + output = tensor_parallel.gather_from_tensor_model_parallel_region(output) + assert output is not None + logits = output[:, -1].view(batch_size, -1).contiguous() + + else: + logits = output[0]['logits'][:, -1].contiguous() + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + assert logits is not None + logits = logits.view(batch_size, -1) # make sure it will generate at least min_length min_length = extra.get('min_tokens_to_generate', 0) @@ -689,6 +713,7 @@ def sample_sequence_batch( logits[:, tokenizer.vocab_size :] = -float('Inf') # started indicates whether the current token step passes the context_length, so we make sure not to overwrite the context tokens + started = context_lengths <= context_length if extra.get('greedy', False): prev = torch.argmax(logits, dim=-1).view(-1) @@ -716,23 +741,25 @@ def sample_sequence_batch( # Insert either new predicted or next 
prompt token tokens[:, context_length] = new_tokens - if output_logits is None: - output = F.log_softmax(output[:, :context_length, :], 2) - indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) - output_logits = torch.gather(output, 2, indices).squeeze(2) - all_generated_indices = indices[:, :, 0] - if all_probs: - full_logits = output - else: - output = F.log_softmax(output, 2) - indices = torch.unsqueeze(new_tokens, 1).unsqueeze(2) - new_output_logits = torch.gather(output, 2, indices).squeeze(2) + if compute_logprob: + if output_logits is None: + output = F.log_softmax(output[:, :context_length, :], 2) - # TODO(rprenger) we're copying output_logits every time. Should pre-allocate - output_logits = torch.cat([output_logits, new_output_logits], 1) - all_generated_indices = torch.cat([all_generated_indices, indices[:, :, 0]], 1) - if all_probs: - full_logits = torch.cat([full_logits, output], 1) + indices = torch.unsqueeze(tokens[:, 1 : context_length + 1], 2) + output_logits = torch.gather(output, 2, indices).squeeze(2) + all_generated_indices = indices[:, :, 0] + if all_probs: + full_logits = output + else: + output = F.log_softmax(output, 2) + indices = torch.unsqueeze(new_tokens, 1).unsqueeze(2) + new_output_logits = torch.gather(output, 2, indices).squeeze(2) + + # TODO(rprenger) we're copying output_logits every time. 
Should pre-allocate + output_logits = torch.cat([output_logits, new_output_logits], 1) + all_generated_indices = torch.cat([all_generated_indices, indices[:, :, 0]], 1) + if all_probs: + full_logits = torch.cat([full_logits, output], 1) src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_embedding_group() @@ -752,10 +779,13 @@ def sample_sequence_batch( src = parallel_state.get_pipeline_model_parallel_last_rank() group = parallel_state.get_pipeline_model_parallel_group() torch.distributed.broadcast(done, src, group) - if all_probs: - yield tokens, lengths, output_logits, full_logits + if compute_logprob: + if all_probs: + yield tokens, lengths, output_logits, full_logits + else: + yield tokens, lengths, output_logits, None else: - yield tokens, lengths, output_logits, None + yield tokens, lengths, None, None else: if parallel_state.is_pipeline_first_stage(): From 010a0e68675d16abe5d4670c7c0b2d093934bfb1 Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Tue, 6 Jun 2023 02:28:31 -0400 Subject: [PATCH 020/123] Fixed bug in MaskedSpecAug that overestimates samples. 
(#6775) Signed-off-by: tbartley94 --- nemo/collections/asr/modules/audio_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index fbd05cb1809b..91c0c10b9604 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -608,7 +608,7 @@ def forward(self, input_spec, length): for idx in range(input_spec.shape[0]): cur_len = length[idx] - patches = range(cur_len // self.patch_size - 1) + patches = range(cur_len // self.patch_size) masked_patches = random.sample(patches, mask_patches) for mp in masked_patches: From 8c26464620cfcb0424b576f4102c3abb0130afa2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 6 Jun 2023 09:50:03 -0700 Subject: [PATCH 021/123] update core version (#6817) (#6819) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index fdd311ba4a59..f3afda26d2bf 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -61,7 +61,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout e6d7e09845590d0a36bc7f29eb28db974fb8da4e && \ + git checkout d2891b4ad3a00e3c4223f89491afd9e1b812f9b5 && \ pip install -e .' 
} } From acf50f42712aa393a9a550a6af7e426f2e4372bf Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Tue, 6 Jun 2023 10:16:50 -0700 Subject: [PATCH 022/123] lora pp2 (#6818) Signed-off-by: arendu --- Jenkinsfile | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index f3afda26d2bf..27537b53a557 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3381,6 +3381,41 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" } } + stage('L2: Megatron GPT PEFT Lora PP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" + sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.peft.peft_scheme='lora' \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel]" + sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" + } + } stage('L2: Megatron GPT PEFT Lora TP=2') { when { anyOf { From 04628ca8d38ebdf365b29cd5c12479068a9aafd2 Mon Sep 17 00:00:00 2001 From: Greg 
Heinrich Date: Tue, 6 Jun 2023 20:20:50 +0200 Subject: [PATCH 023/123] Add optional index mapping dir in mmap text datasets (#6683) If datasets are stored on a read-only medium, index files cannot be created into adjacent files and an alternative directory must be specified for index mapping files. This commit adds an optional `index_mapping_dir` to the constructors. Unit tests are also added. [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Update path formatting for relative paths Signed-off-by: Greg Heinrich --- .../megatron/gpt_sft_dataset.py | 6 +- .../language_modeling/text_memmap_dataset.py | 146 +++++++++++++++--- tests/collections/nlp/test_mem_map_dataset.py | 113 ++++++++++++++ 3 files changed, 238 insertions(+), 27 deletions(-) create mode 100644 tests/collections/nlp/test_mem_map_dataset.py diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index f9ef6c8470c2..94c4b3c54c63 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -51,7 +51,7 @@ def __init__( file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. 
Data examples will be dropped if they do not meet the min length requirements. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example add_sep (bool): Whether to add a separation token to each data example (goes between prompt and answer) @@ -93,7 +93,9 @@ def __init__( self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape') assert self.truncation_field in ["answer", "context"] - self.indexed_dataset = JSONLMemMapDataset(dataset_paths=[file_path], tokenizer=None, header_lines=0) + self.indexed_dataset = JSONLMemMapDataset( + dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir + ) # Will be None after this call if `max_num_samples` is None self._build_samples_mapping() diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index b26f213282bb..e1a30a3aafb7 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -19,6 +19,7 @@ import pickle import time from functools import partial +from typing import Callable, List, Optional, Type import numpy as np import torch @@ -35,7 +36,7 @@ def _build_index_from_memdata(fn, newline_int): """ Build index of delimiter positions between samples in memmap. Can be provided externally. - + Returns a 1D array of ints. 
""" # use memmap to read file @@ -68,17 +69,28 @@ class TextMemMapDataset(Dataset): def __init__( self, - dataset_paths, - newline_int=10, - header_lines=0, - workers=None, - tokenizer=None, - sort_dataset_paths=True, - build_index_fn=_build_index_from_memdata, + dataset_paths: List[str], + newline_int: Optional[int] = 10, + header_lines: Optional[int] = 0, + workers: Optional[int] = None, + tokenizer: Optional[Type["TokenizerSpec"]] = None, + build_index_fn: Optional[Callable[[str, Optional[int]], bool]] = _build_index_from_memdata, + sort_dataset_paths: Optional[bool] = True, + index_mapping_dir: Optional[str] = None, ): """ - build_index_fn - a callable build_index_fn(fn, newline_int) -> midx [np.array] that returns the index of newlines in a file fn - must be pickleable (to be used in multiprocessing.Pool.map) + Args: + dataset_paths: list of JSONL file paths. + newline_int: ASCII code to use to interpret newlines in file. + header_lines: number of header lines in JSON files. + workers: number of workers to use for creating index files. + tokenizer: tokenizer to use to convert text to tokens. + build_index_fn: a callable build_index_fn(fn, newline_int) -> midx [np.array] + that returns the index of newlines in a file fn must be pickleable + (to be used in multiprocessing.Pool.map). + sort_dataset_paths: whether to sort datasets by paths. + index_mapping_dir: directory to save the index mapping to. + If None, will write to the same folder as the dataset. 
""" super().__init__() self.mdata_midx_list = [] @@ -106,14 +118,20 @@ def __init__( is_ditributed = torch.distributed.is_available() and torch.distributed.is_initialized() if not is_ditributed or (is_ditributed and torch.distributed.get_rank() == 0): - build_index_files(dataset_paths, newline_int, workers=self._worker, build_index_fn=build_index_fn) + build_index_files( + dataset_paths, + newline_int, + workers=self._worker, + build_index_fn=build_index_fn, + index_mapping_dir=index_mapping_dir, + ) if is_ditributed: torch.distributed.barrier() logging.info(f"Loading data files") start_time = time.time() - mdata_midx_list = [self.load_file(fn) for fn in self._files_list] + mdata_midx_list = [self.load_file(fn, index_mapping_dir) for fn in self._files_list] logging.info( f'Time loading {len(mdata_midx_list)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}' ) @@ -193,7 +211,7 @@ def _build_data_from_text(self, text): return data - def load_file(self, fn): + def load_file(self, fn, index_mapping_dir: Optional[str] = None): """ Loads a text file as np.int8. @@ -203,7 +221,7 @@ def load_file(self, fn): size - number of lines in file """ logging.info(f"Loading {fn}") - idx_fn = f"{fn}.{__idx_suffix__}" + idx_fn = _index_fn(fn, index_mapping_dir) # create data map mdata = np.memmap(fn, dtype=np.uint8, mode='r') @@ -246,15 +264,29 @@ class CSVMemMapDataset(TextMemMapDataset): def __init__( self, - dataset_paths, - newline_int=10, - header_lines=1, - workers=None, - tokenizer=None, - sort_dataset_paths=True, + dataset_paths: List[str], + newline_int: Optional[int] = 10, + header_lines: Optional[int] = 0, + workers: Optional[int] = None, + tokenizer: Optional[Type["TokenizerSpec"]] = None, + sort_dataset_paths: Optional[bool] = True, data_col=1, data_sep=',', + index_mapping_dir: Optional[str] = None, ): + """ + Args: + dataset_paths: list of JSONL file paths. + newline_int: ASCII code to use to interpret newlines in file. 
+ header_lines: number of header lines in CSV files. + workers: number of workers to use for creating index files. + tokenizer: tokenizer to use to convert text to tokens. + sort_dataset_paths: whether to sort datasets by paths. + data_col: index of data column. + data_sep: data separator. + index_mapping_dir: directory to save the index mapping to. + If None, will write to the same folder as the dataset. + """ super().__init__( dataset_paths=dataset_paths, newline_int=newline_int, @@ -262,6 +294,7 @@ def __init__( workers=workers, tokenizer=tokenizer, sort_dataset_paths=sort_dataset_paths, + index_mapping_dir=index_mapping_dir, ) self._data_col = data_col self._data_sep = data_sep @@ -280,8 +313,26 @@ class JSONLMemMapDataset(TextMemMapDataset): """ def __init__( - self, dataset_paths, newline_int=10, header_lines=1, workers=None, tokenizer=None, sort_dataset_paths=True, + self, + dataset_paths: List[str], + newline_int: Optional[int] = 10, + header_lines: Optional[int] = 0, + workers: Optional[int] = None, + tokenizer: Optional[Type["TokenizerSpec"]] = None, + sort_dataset_paths: Optional[bool] = True, + index_mapping_dir: Optional[str] = None, ): + """ + Args: + dataset_paths: list of JSONL file paths. + newline_int: ASCII code to use to interpret newlines in file. + header_lines: number of header lines in JSON files. + workers: number of workers to use for creating index files. + tokenizer: tokenizer to use to convert text to tokens. + sort_dataset_paths: whether to sort datasets by paths. + index_mapping_dir: directory to save the index mapping to. + If None, will write to the same folder as the dataset.
+ """ super().__init__( dataset_paths=dataset_paths, newline_int=newline_int, @@ -289,6 +340,7 @@ def __init__( workers=workers, tokenizer=tokenizer, sort_dataset_paths=sort_dataset_paths, + index_mapping_dir=index_mapping_dir, ) def _build_data_from_text(self, text): @@ -304,9 +356,48 @@ def _index_file_exists(idx_fn): return False -def _build_memmap_index_files(newline_int, build_index_fn, fn): +def _index_fn(fn: str, index_mapping_dir: str) -> str: + """Return base file name of index files. + + This returns the base file name associated with specified index + files. This base name is the base on top of which suffixes + like .npy or .info are added. + + The parent directory is created if it does not already exist. + + fn may be specified in multiple ways: + 1. file name: data.jsonl, + 2. relative path to a file: relative/path/to/data.jsonl, + 3. absolute path to a file: /absolute/path/to/data.jsonl. + + This function returns paths in the pattern of: + 1. /path/to/index_mapping_dir/data.jsonl.idx + 2. /path/to/index_mapping_dir/relative/path/to/data.jsonl.idx + 3. /path/to/index_mapping_dir/absolute/path/to/data.jsonl.idx + + Args: + fn: filename to get base name for. + index_mapping_dir: directory to save the index mapping to. + If None, will write to the same folder as the dataset. + """ + if index_mapping_dir: + # Remove leading "/" and "..". + while fn.startswith(("/", "..")): + if fn.startswith(".."): + fn = fn.lstrip("..") + if fn.startswith("/"): + fn = fn.lstrip("/") + idx_fn = f"{os.path.join(index_mapping_dir, fn)}.{__idx_suffix__}" + # Create parent directory if needed.
+ os.makedirs(os.path.dirname(idx_fn), exist_ok=True) + else: + idx_fn = f"{fn}.{__idx_suffix__}" + return idx_fn + + +def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir: str): """Helper function to build an index file""" - idx_fn = f"{fn}.{__idx_suffix__}" + idx_fn = _index_fn(fn, index_mapping_dir) # create data map if _index_file_exists(idx_fn): @@ -332,7 +423,9 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn): return True -def build_index_files(dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata): +def build_index_files( + dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None +): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: raise ValueError("files_list must contain at leat one file name") @@ -344,7 +437,10 @@ def build_index_files(dataset_paths, newline_int, workers=None, build_index_fn=_ # load all files into memmap start_time = time.time() with mp.Pool(workers) as p: - build_status = p.map(partial(_build_memmap_index_files, newline_int, build_index_fn), dataset_paths) + build_status = p.map( + partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir), + dataset_paths, + ) logging.info( f'Time building {sum(build_status)} / {len(build_status)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}' diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py new file mode 100644 index 000000000000..b60636022e05 --- /dev/null +++ b/tests/collections/nlp/test_mem_map_dataset.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import json +import os + +import pytest + +from nemo.collections.nlp.data.language_modeling import text_memmap_dataset + + +@pytest.fixture +def jsonl_file(tmp_path): + # Create a temporary file path + file_path = tmp_path / "data.jsonl" + + # Generate data to write to the JSONL file + data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}, {"name": "Bob", "age": 35}] + + # Write data to the JSONL file + with open(file_path, mode="w") as file: + for item in data: + json.dump(item, file) + file.write('\n') + + # Provide the file path to the test function + yield str(file_path) + + # Optional: Clean up the temporary file after the test + file_path.unlink() + + +@pytest.fixture +def csv_file(tmp_path): + # Create a temporary file path + file_path = tmp_path / "data.csv" + + # Generate data to write to the CSV file + data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] + + # Write data to the CSV file + with open(file_path, mode="w", newline="") as file: + writer = csv.writer(file) + writer.writerows(data) + + # Provide the file path to the test function + yield str(file_path) + + # Optional: Clean up the temporary file after the test + file_path.unlink() + + +def test_jsonl_mem_map_dataset(jsonl_file): + """Test for JSONL memory-mapped datasets.""" + + indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) + assert indexed_dataset[0] == {"name": "John", "age": 30} + assert indexed_dataset[1] == {"name": "Jane", "age": 25} + assert indexed_dataset[2] == {"name": "Bob", "age": 35} + 
+ +def test_csv_mem_map_dataset(csv_file): + """Test for CSV memory-mapped datasets.""" + + indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) + assert indexed_dataset[0].strip() == "John" + assert indexed_dataset[1].strip() == "Jane" + assert indexed_dataset[2].strip() == "Bob" + + +@pytest.mark.parametrize( + "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset] +) +@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) +@pytest.mark.parametrize("relative_index_fn", [True, False]) +def test_mem_map_dataset_index_mapping_dir( + tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn +): + """Test for index_mapping_dir.""" + if relative_index_fn: + jsonl_file = os.path.relpath(jsonl_file) + else: + jsonl_file = os.path.abspath(jsonl_file) + + if use_alternative_index_mapping_dir: + index_mapping_dir = tmp_path / "subdir" + dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) + # Index files should not be created in default location. + assert not os.path.isfile(f"{jsonl_file}.idx.npy") + assert not os.path.isfile(f"{jsonl_file}.idx.info") + if relative_index_fn: + # Remove leading ".." sequences. 
+ while jsonl_file.startswith(("../")): + jsonl_file = jsonl_file.lstrip("../") + idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" + assert os.path.isfile(f"{idx_fn}.npy") + assert os.path.isfile(f"{idx_fn}.info") + else: + text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) + assert os.path.isfile(f"{jsonl_file}.idx.npy") + assert os.path.isfile(f"{jsonl_file}.idx.info") From bf270794267e0240d8a8b2f2514c80c6929c76f1 Mon Sep 17 00:00:00 2001 From: Yen-Shi Wang <6960565+yen-shi@users.noreply.github.com> Date: Tue, 6 Jun 2023 11:30:40 -0700 Subject: [PATCH 024/123] Add inference kv cache support for transformer TE path (#6627) * Add kv cache support for transformer TE path Signed-off-by: Yen-Shi Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Mark get_data_parallel_group as WAR Signed-off-by: Yen-Shi Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Initialize process group for FP8 training Signed-off-by: Tim Moon * Update Megatron GPT eval script for non-FP8 path Signed-off-by: Yen-Shi Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Yen-Shi Wang Signed-off-by: Tim Moon Signed-off-by: Yen-Shi Wang <6960565+yen-shi@users.noreply.github.com> Co-authored-by: Yen-Shi Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tim Moon Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Eric Harper --- .../language_modeling/megatron_gpt_eval.py | 23 +++++++++++++++- .../modules/common/megatron/transformer.py | 26 +++++++++++++++---- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index d7319fb72a01..b33cdefc6df2 100644 --- 
a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -154,6 +154,15 @@ def __getitem__(self, idx): return self.sentences[idx] +def remove_padded_prompts(response, nb_paddings): + result = {} + for k, v in response.items(): + if v != None and (type(v) is list or type(v) is torch.Tensor): + v = v[:-nb_paddings] + result[k] = v + return result + + @hydra_runner(config_path="conf", config_name="megatron_gpt_inference") def main(cfg) -> None: @@ -254,22 +263,34 @@ def main(cfg) -> None: "compute_logprob": cfg.inference.compute_logprob, } + fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True) + if fp8_enabled: + nb_paddings = 0 + while len(cfg.prompts) % 8 != 0: + cfg.prompts.append("") + nb_paddings += 1 + # First method of running text generation, call model.generate method response = model.generate( inputs=OmegaConf.to_container(cfg.prompts), length_params=length_params, sampling_params=sampling_params ) + if fp8_enabled: + response = remove_padded_prompts(response, nb_paddings) print("***************************") print(response) print("***************************") # Second method of running text generation, call trainer.predict [recommended] + bs = 8 if fp8_enabled else 2 ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) - request_dl = DataLoader(dataset=ds, batch_size=2) + request_dl = DataLoader(dataset=ds, batch_size=bs) config = OmegaConf.to_container(cfg.inference) model.set_inference_config(config) response = trainer.predict(model, request_dl) + if fp8_enabled: + response[-1] = remove_padded_prompts(response[-1], nb_paddings) print("***************************") print(response) print("***************************") diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 9a09a9f9aa0b..f5dfbcabcd0e 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ 
b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -945,6 +945,9 @@ def __init__( self.position_embedding_type = position_embedding_type self.multi_query_attention = multi_query_attention + self.inference_current_sequence_len = 0 + self.inference_params = None + self.activations_checkpoint_method = activations_checkpoint_method self.activations_checkpoint_num_layers = activations_checkpoint_num_layers self.activations_checkpoint_granularity = activations_checkpoint_granularity @@ -1451,6 +1454,20 @@ def forward( if get_key_value: presents = [] + if self.transformer_engine: + # Pass key value information to TE through inference_params to pre-allocate memory + if set_inference_key_value_memory: + self.inference_params = type('', (), {})() + self.inference_params.max_sequence_len = inference_max_sequence_len + self.inference_params.max_batch_size = hidden_states.size(1) + self.inference_params.batch_size_offset = 0 + self.inference_params.key_value_memory_dict = {} + self.inference_params.sequence_len_offset = 0 + self.inference_current_sequence_len = 0 + + if self.inference_params != None: + self.inference_params.sequence_len_offset = self.inference_current_sequence_len + for index in range(self.num_layers): layer = self._get_layer(index) past = None @@ -1479,19 +1496,15 @@ def forward( checkpoint_core_attention = False if self.transformer_engine: - - inference_params = None - hidden_states = layer( hidden_states, attention_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask, - inference_params=inference_params, + inference_params=self.inference_params, is_first_microbatch=self.is_first_microbatch, checkpoint_core_attention=checkpoint_core_attention, ) - else: hidden_states = layer( hidden_states, @@ -1507,6 +1520,9 @@ def forward( cross_attention_relative_position_bias=cross_attention_relative_position_bias, checkpoint_core_attention=checkpoint_core_attention, ) + # Update current sequence length outside of the loops + if 
self.transformer_engine: + self.inference_current_sequence_len += hidden_states.size(0) # Skip counter update for eval and activation checkpointing if torch.is_grad_enabled() and self.training: From 336372c8f398a93701999677f83e924edc31f613 Mon Sep 17 00:00:00 2001 From: Dima Rekesh Date: Wed, 7 Jun 2023 14:09:54 -0700 Subject: [PATCH 025/123] Support large inputs to Conformer and Fast Conformer (#6556) * initial commit Signed-off-by: Dima Rekesh * typos Signed-off-by: Dima Rekesh * tweaks to padding Signed-off-by: Dima Rekesh * comments Signed-off-by: Dima Rekesh * attempt at first working version Signed-off-by: Dima Rekesh * typos and fixed p calculation Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing merge artifacts Signed-off-by: Dima Rekesh * typo Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removing unnessary imports Signed-off-by: Dima Rekesh * if batch split succeeded no need to conv again Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adding channel wise split Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adding reference to pytorch issue 80020 Signed-off-by: Dima Rekesh * removing time chunking methods Signed-off-by: Dima Rekesh * accounting for the actual self._stride value Signed-off-by: Dima Rekesh * limiting the fix to dw_striding subsampling Signed-off-by: Dima Rekesh * renamed methods Signed-off-by: Dima Rekesh * one more accounting for the actual self._stride value Signed-off-by: Dima Rekesh * support for causal convs Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * option to set conv chunking size manually * 
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing imports * subsampling test Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rename variable Signed-off-by: Dima Rekesh * imports in test Signed-off-by: Dima Rekesh * more runtime checks * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * a more careful test Signed-off-by: Dima Rekesh * bug in causal Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix in causal Signed-off-by: Dima Rekesh * change_conv_chunking_factor methods Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * renamed methods Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disabling chunking by default Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo Signed-off-by: Dima Rekesh * changing default chunking to auto Signed-off-by: Dima Rekesh * only split if needed Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * only split if needed Signed-off-by: Dima Rekesh --------- Signed-off-by: Dima Rekesh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../asr/modules/conformer_encoder.py | 24 +++ nemo/collections/asr/parts/mixins/mixins.py | 28 ++++ .../asr/parts/submodules/subsampling.py | 151 +++++++++++++++++- tests/collections/asr/test_asr_subsampling.py | 61 +++++++ 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 tests/collections/asr/test_asr_subsampling.py diff --git 
a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index df5b8f5c69ed..7c786f9c9720 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -67,6 +67,8 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to striding. subsampling_factor (int): the subsampling factor which should be power of 2 Defaults to 4. + subsampling_conv_chunking_factor(int): optionally, force chunk inputs (helpful for large inputs) + Should be power of 2, 1 (auto-chunking, default), or -1 (no chunking) subsampling_conv_channels (int): the size of the convolutions in the subsampling module Defaults to -1 which would set it to d_model. reduction (str, Optional): the method of reduction, choices=['pooling', 'striding']. If no value @@ -245,6 +247,7 @@ def __init__( causal_downsampling=False, subsampling='striding', subsampling_factor=4, + subsampling_conv_chunking_factor=1, subsampling_conv_channels=-1, reduction=None, reduction_position=None, @@ -279,6 +282,7 @@ def __init__( self.scale = math.sqrt(self.d_model) self.att_context_style = att_context_style self.subsampling_factor = subsampling_factor + self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor self.self_attention_model = self_attention_model self.global_tokens = global_tokens @@ -355,6 +359,7 @@ def __init__( feat_in=feat_in, feat_out=d_model, conv_channels=subsampling_conv_channels, + subsampling_conv_chunking_factor=subsampling_conv_chunking_factor, activation=nn.ReLU(True), is_causal=causal_downsampling, ) @@ -977,6 +982,25 @@ def change_attention_model( self._cfg.self_attention_model = self_attention_model self._cfg.att_context_size = att_context_size + def change_subsampling_conv_chunking_factor(self, subsampling_conv_chunking_factor: int): + """ + Update the conv_chunking_factor (int) + Default is 1 (auto) + Set it to -1 (disabled) or to a specific value 
(power of 2) if you OOM in the conv subsampling layers + + + Args: + subsampling_conv_chunking_factor (int) + """ + + if not hasattr(self.pre_encode, "change_subsampling_conv_chunking_factor"): + logging.info("Model pre_encoder doesn't have a change_subsampling_conv_chunking_factor method ") + return + + self.pre_encode.change_subsampling_conv_chunking_factor( + subsampling_conv_chunking_factor=subsampling_conv_chunking_factor + ) + class ConformerEncoderAdapter(ConformerEncoder, adapter_mixins.AdapterModuleMixin): diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index eba896d0478d..4c43960ac9d2 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -432,6 +432,34 @@ def change_attention_model( self.cfg.encoder.self_attention_model = self_attention_model self.cfg.encoder.att_context_size = att_context_size + def change_subsampling_conv_chunking_factor( + self, subsampling_conv_chunking_factor: int, update_config: bool = True + ): + """ + Update the subsampling_conv_chunking_factor (int) if the function is available in the encoder. + Default is 1 (auto) + Set it to -1 (disabled) or to a specific value (power of 2) if you OOM in the conv subsampling layers + + Args: + subsampling_conv_chunking_factor (int) + """ + + if not hasattr(self, 'encoder'): + logging.info( + "Could not call the change_subsampling_conv_chunking_factor method in encoder " + "since the model provided does not contain an `encoder` module in its config." 
+ ) + return + + if not hasattr(self.encoder, "change_subsampling_conv_chunking_factor"): + logging.info("Model encoder doesn't have a change_subsampling_conv_chunking_factor method ") + return + + self.encoder.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor) + if update_config: + with open_dict(self.cfg): + self.cfg.encoder.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor + def conformer_stream_step( self, processed_signal: torch.Tensor, diff --git a/nemo/collections/asr/parts/submodules/subsampling.py b/nemo/collections/asr/parts/submodules/subsampling.py index 4358d09977fe..23bd625108c7 100644 --- a/nemo/collections/asr/parts/submodules/subsampling.py +++ b/nemo/collections/asr/parts/submodules/subsampling.py @@ -19,6 +19,7 @@ from torch.nn import LayerNorm from nemo.collections.asr.parts.submodules.causal_convs import CausalConv2D +from nemo.utils import logging class StackingSubsampling(torch.nn.Module): @@ -65,6 +66,8 @@ class ConvSubsampling(torch.nn.Module): Args: subsampling (str): The subsampling technique from {"vggnet", "striding"} subsampling_factor (int): The subsampling factor which should be a power of 2 + subsampling_conv_chunking_factor (int): Input chunking factor which can be -1 (no chunking) + 1 (auto) or a power of 2. Default is 1 feat_in (int): size of the input features feat_out (int): size of the output features conv_channels (int): Number of channels for the convolution layers. 
@@ -72,7 +75,15 @@ class ConvSubsampling(torch.nn.Module): """ def __init__( - self, subsampling, subsampling_factor, feat_in, feat_out, conv_channels, activation=nn.ReLU(), is_causal=False + self, + subsampling, + subsampling_factor, + feat_in, + feat_out, + conv_channels, + subsampling_conv_chunking_factor=1, + activation=nn.ReLU(), + is_causal=False, ): super(ConvSubsampling, self).__init__() self._subsampling = subsampling @@ -86,6 +97,14 @@ def __init__( self.subsampling_factor = subsampling_factor self.is_causal = is_causal + if ( + subsampling_conv_chunking_factor != -1 + and subsampling_conv_chunking_factor != 1 + and subsampling_conv_chunking_factor % 2 != 0 + ): + raise ValueError("subsampling_conv_chunking_factor should be -1, 1, or a power of 2") + self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor + in_channels = 1 layers = [] @@ -264,7 +283,32 @@ def forward(self, x, lengths): ) x = x.unsqueeze(1) - x = self.conv(x) + # split inputs if chunking_factor is set + if self.subsampling_conv_chunking_factor != -1: + if self.subsampling_conv_chunking_factor == 1: + # if subsampling_conv_chunking_factor is 1, we split only if needed + # avoiding a bug / feature limiting indexing of tensors to 2**31 + # see https://github.com/pytorch/pytorch/issues/80020 + x_ceil = 2 ** 31 / self._conv_channels * self._stride * self._stride + if torch.numel(x) > x_ceil: + need_to_split = True + else: + need_to_split = False + else: + # if subsampling_conv_chunking_factor > 1 we always split + need_to_split = True + + if need_to_split: + x, success = self.conv_split_by_batch(x) + if not success: # if unable to split by batch, try by channel + if self._subsampling == 'dw_striding': + x = self.conv_split_by_channel(x) + else: + x = self.conv(x) # try anyway + else: + x = self.conv(x) + else: + x = self.conv(x) b, c, t, f = x.size() x = self.out(x.transpose(1, 2).reshape(b, t, -1)) @@ -293,6 +337,109 @@ def reset_parameters(self): 
torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale) torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale) + def conv_split_by_batch(self, x): + """ Tries to split input by batch, run conv and concat results """ + b, _, _, _ = x.size() + if b == 1: # can't split if batch size is 1 + return x, False + + if self.subsampling_conv_chunking_factor > 1: + cf = self.subsampling_conv_chunking_factor + logging.debug(f'using manually set chunking factor: {cf}') + else: + # avoiding a bug / feature limiting indexing of tensors to 2**31 + # see https://github.com/pytorch/pytorch/issues/80020 + x_ceil = 2 ** 31 / self._conv_channels * self._stride * self._stride + p = math.ceil(math.log(torch.numel(x) / x_ceil, 2)) + cf = 2 ** p + logging.debug(f'using auto set chunking factor: {cf}') + + new_batch_size = b // cf + if new_batch_size == 0: # input is too big + return x, False + + logging.debug(f'conv subsampling: using split batch size {new_batch_size}') + return torch.cat([self.conv(chunk) for chunk in torch.split(x, new_batch_size, 0)]), True + + def conv_split_by_channel(self, x): + """ For dw convs, splits the depthwise convs by channel and the pointwise convs by time, runs the convs and concats the results """ + x = self.conv[0](x) # full conv2D + x = self.conv[1](x) # activation + + for i in range(self._sampling_num - 1): + _, c, t, _ = x.size() + + if self.subsampling_conv_chunking_factor > 1: + cf = self.subsampling_conv_chunking_factor + logging.debug(f'using manually set chunking factor: {cf}') + else: + # avoiding a bug / feature limiting indexing of tensors to 2**31 + # see https://github.com/pytorch/pytorch/issues/80020 + p = math.ceil(math.log(torch.numel(x) / 2 ** 31, 2)) + cf = 2 ** p + logging.debug(f'using auto set chunking factor: {cf}') + + new_c = int(c // cf) + if new_c == 0: + logging.warning(f'chunking factor {cf} is too high; splitting down to one channel.') + new_c = 1 + + new_t = int(t // cf) + if new_t == 0: + logging.warning(f'chunking factor {cf} is too high; splitting down to one 
timestep.') + new_t = 1 + + logging.debug(f'conv dw subsampling: using split C size {new_c} and split T size {new_t}') + x = self.channel_chunked_conv(self.conv[i * 3 + 2], new_c, x) # conv2D, depthwise + + # splitting pointwise convs by time + x = torch.cat([self.conv[i * 3 + 3](chunk) for chunk in torch.split(x, new_t, 2)], 2) # conv2D, pointwise + x = self.conv[i * 3 + 4](x) # activation + return x + + def channel_chunked_conv(self, conv, chunk_size, x): + """ Performs channel chunked convolution""" + + ind = 0 + out_chunks = [] + for chunk in torch.split(x, chunk_size, 1): + step = chunk.size()[1] + + if self.is_causal: + chunk = nn.functional.pad( + chunk, pad=(self._kernel_size - 1, self._stride - 1, self._kernel_size - 1, self._stride - 1) + ) + ch_out = nn.functional.conv2d( + chunk, + conv.weight[ind : ind + step, :, :, :], + bias=conv.bias[ind : ind + step], + stride=self._stride, + padding=0, + groups=step, + ) + else: + ch_out = nn.functional.conv2d( + chunk, + conv.weight[ind : ind + step, :, :, :], + bias=conv.bias[ind : ind + step], + stride=self._stride, + padding=self._left_padding, + groups=step, + ) + out_chunks.append(ch_out) + ind += step + + return torch.cat(out_chunks, 1) + + def change_subsampling_conv_chunking_factor(self, subsampling_conv_chunking_factor: int): + if ( + subsampling_conv_chunking_factor != -1 + and subsampling_conv_chunking_factor != 1 + and subsampling_conv_chunking_factor % 2 != 0 + ): + raise ValueError("subsampling_conv_chunking_factor should be -1, 1, or a power of 2") + self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor + def calc_length(lengths, all_paddings, kernel_size, stride, ceil_mode, repeat_num=1): """ Calculates the output length of a Tensor passed through a convolution or max pooling layer""" diff --git a/tests/collections/asr/test_asr_subsampling.py b/tests/collections/asr/test_asr_subsampling.py new file mode 100644 index 000000000000..fe5295be11f1 --- /dev/null +++ 
b/tests/collections/asr/test_asr_subsampling.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch + +from nemo.collections.asr.models import ASRModel + + +class TestASRSubsamplingConvChunking: + @pytest.mark.with_downloads() + @pytest.mark.unit + def test_forward(self): + asr_model = ASRModel.from_pretrained("stt_en_fastconformer_ctc_large") + asr_model = asr_model.eval() + asr_model.preprocessor.featurizer.dither = 0.0 + asr_model.preprocessor.featurizer.pad_to = 0 + + len = 512 + + input_signal_batch1 = torch.randn(size=(1, len), device=asr_model.device) + length_batch1 = torch.randint(low=161, high=500, size=[1], device=asr_model.device) + + input_signal_batch4 = torch.randn(size=(4, len), device=asr_model.device) + length_batch4 = torch.randint(low=161, high=500, size=[4], device=asr_model.device) + + with torch.no_grad(): + # regular inference + logprobs_batch1_nosplit, _, _ = asr_model.forward( + input_signal=input_signal_batch1, input_signal_length=length_batch1 + ) + logprobs_batch4_nosplit, _, _ = asr_model.forward( + input_signal=input_signal_batch4, input_signal_length=length_batch4 + ) + + # force chunking to 2 + asr_model.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor=2) + + # chunked inference by channels as batch is 1 + logprobs_batch1_split, _, _ = asr_model.forward( + input_signal=input_signal_batch1, 
input_signal_length=length_batch1 + ) + # chunked inference by batch as it is 4 [> 1] + logprobs_batch4_split, _, _ = asr_model.forward( + input_signal=input_signal_batch4, input_signal_length=length_batch4 + ) + + diff = torch.mean(torch.abs(logprobs_batch1_split - logprobs_batch1_nosplit)) + assert diff <= 1e-6 + diff = torch.max(torch.abs(logprobs_batch4_split - logprobs_batch4_nosplit)) + assert diff <= 1e-6 From ebfcef7902c9232da04ecf90f505c3c996e7e059 Mon Sep 17 00:00:00 2001 From: Dima Rekesh Date: Wed, 7 Jun 2023 15:00:00 -0700 Subject: [PATCH 026/123] sharded_manifests updated docs (#6833) Signed-off-by: Dima Rekesh --- docs/source/asr/datasets.rst | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 7ec0f9c61b74..617d5195005f 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -218,11 +218,6 @@ of filepaths, e.g. ``['/data/shard1.tar', '/data/shard2.tar']``, or in a single As with non-tarred datasets, the manifest file should be passed in ``manifest_filepath``. The dataloader assumes that the length of the manifest after filtering is the correct size of the dataset for reporting training progress. -If the manifest is large, you may wish to reference sharded manifest files instead of a single manifest file. The naming convention -is identical to the audio tarballs and there should be a 1:1 relationship between a sharded audio tarfile and its manifest shard; e.g. -``'/data/sharded_manifests/manifest__OP_1..64_CL_'`` in the above example. Using sharded manifests improves job startup times and -decreases memory usage, as each worker only loads manifest shards for the corresponding audio shards instead of the entire manifest. - The ``tarred_shard_strategy`` field of the config file can be set if you have multiple shards and are running an experiment with multiple workers. 
It defaults to ``scatter``, which preallocates a set of shards per worker which do not change during runtime. Note that this strategy, on specific occasions (when the number of shards is not divisible with ``world_size``), will not sample @@ -242,6 +237,18 @@ see the corresponding class APIs in the `Datasets <./api.html#Datasets>`__ secti applied such that each worker ends up with the same number of files. We currently do not check for this in any dataloader, but the user's program may hang if the shards are uneven. +Sharded Manifests +~~~~~~~~~~~~~~~~~ +If your dataset / manifest is large, you may wish to use sharded manifest files instead of a single manifest file. The naming convention +is identical to the audio tarballs and there should be a 1:1 relationship between a sharded audio tarfile and its manifest shard; e.g. +``'/data/sharded_manifests/manifest__OP_1..64_CL_'`` in the above example. Using sharded manifests improves job startup times and +decreases memory usage, as each worker only loads manifest shards for the corresponding audio shards instead of the entire manifest. + +To enable sharded manifest filename expansion, set the ``shard_manifests`` field of the config file to ``true``. In addition, the +``defer_setup`` flag must also be ``true``, so that the dataloader is initialized after DDP has been set up and its length can be collected from +the distributed workers. 
+ + Conversion to Tarred Datasets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 52e23e0d075e2732096aaf1ecac1f5659f1717a9 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Wed, 7 Jun 2023 18:20:51 -0700 Subject: [PATCH 027/123] added fc-xl, xxl and titanet-s models (#6832) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- docs/source/asr/data/benchmark_en.csv | 5 ++++- .../speaker_recognition/data/speaker_results.csv | 1 + .../fastconformer/fast-conformer_ctc_bpe.yaml | 16 ++++++++++++++++ .../fast-conformer_transducer_bpe.yaml | 16 ++++++++++++++++ nemo/collections/asr/models/ctc_bpe_models.py | 7 +++++++ nemo/collections/asr/models/label_models.py | 7 +++++++ nemo/collections/asr/models/rnnt_bpe_models.py | 14 ++++++++++++++ 7 files changed, 65 insertions(+), 1 deletion(-) diff --git a/docs/source/asr/data/benchmark_en.csv b/docs/source/asr/data/benchmark_en.csv index 5f68e9ca22ce..5c764ba38651 100644 --- a/docs/source/asr/data/benchmark_en.csv +++ b/docs/source/asr/data/benchmark_en.csv @@ -28,4 +28,7 @@ stt_en_conformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/ca stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge" stt_en_fastconformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large" stt_en_fastconformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large" -stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" \ No newline at end of file +stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" 
+stt_en_fastconformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge" +stt_en_fastconformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge" +stt_en_fastconformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge" \ No newline at end of file diff --git a/docs/source/asr/speaker_recognition/data/speaker_results.csv b/docs/source/asr/speaker_recognition/data/speaker_results.csv index a0e865c9c487..c92c971e4939 100644 --- a/docs/source/asr/speaker_recognition/data/speaker_results.csv +++ b/docs/source/asr/speaker_recognition/data/speaker_results.csv @@ -1,4 +1,5 @@ Model Name,Model Base Class,Model Card titanet_large,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large" +titanet_small,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_small" speakerverification_speakernet,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet" ecapa_tdnn,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn" \ No newline at end of file diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index 8c7561381299..41a8abd93758 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -17,6 +17,22 @@ # | bf16 | 32GB | 64 | # | | 80GB | 128 | # +-----------+------------+------------+ +# Here are the recommended configs for different variants of FastConformer-CTC-BPE, other parameters are the same as in this config file. 
+# +# +--------------+---------+---------+----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Model | d_model | n_heads | n_layers |conv_kernel_size| weight_decay | pred_hidden/joint_hidden | pred_rnn_layers | xscaling | +# +==============+=========+========+===========+================+==============+==========================+=================+============+ +# | Small (14M) | 176 | 4 | 16 | 9 | 0.0 | 320 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Medium (32M) | 256 | 4 | 16 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Large (120M) | 512 | 8 | 17 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XLarge (616M)| 1024 | 8 | 24 | 9 | 1e-3 | 640 | 2 | False | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XXLarge(1.2B)| 1024 | 8 | 42 | 5 | 1e-3 | 640 | 2 | False | +# +--------------------------------------------------------------+--------------+--------------------------+-----------------+------------+ + # Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. # Default learning parameters in this config are set for global batch size of 2K while you may use lower values. 
diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 0b0ec78e077d..9e3da8d3545f 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -17,6 +17,22 @@ # | bf16 | 32GB | 64 | # | | 80GB | 128 | # +-----------+------------+------------+ +# Here are the recommended configs for different variants of FastConformer-Transducer-BPE, other parameters are the same as in this config file. +# +# +--------------+---------+---------+----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Model | d_model | n_heads | n_layers |conv_kernel_size| weight_decay | pred_hidden/joint_hidden | pred_rnn_layers | xscaling | +# +==============+=========+========+===========+================+==============+==========================+=================+============+ +# | Small (14M) | 176 | 4 | 16 | 9 | 0.0 | 320 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Medium (32M) | 256 | 4 | 16 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Large (120M) | 512 | 8 | 17 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XLarge (616M)| 1024 | 8 | 24 | 9 | 1e-3 | 640 | 2 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XXLarge(1.2B)| 1024 | 8 | 42 | 5 | 1e-3 | 640 | 2 | False | +# 
+--------------------------------------------------------------+--------------+--------------------------+-----------------+------------+ + # Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. # Default learning parameters in this config are set for global batch size of 2K while you may use lower values. diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index a74c7f3de5c2..7d3b236b2bab 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -606,4 +606,11 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xlarge/versions/1.20.0/files/stt_en_fastconformer_ctc_xlarge.nemo", + ) + results.append(model) + return results diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index aefa8743826b..cc789dacff11 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -92,6 +92,13 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) result.append(model) + model = PretrainedModelInfo( + pretrained_model_name="titanet_small", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_small", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_small/versions/1.19.0/files/titanet-s.nemo", + ) + result.append(model) + return result def __init__(self, cfg: DictConfig, trainer: Trainer = None): diff --git 
a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 6fed8be9d410..9ed38a376103 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -253,6 +253,20 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_transducer_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xlarge/versions/1.20.1/files/stt_en_fastconformer_transducer_xlarge.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_transducer_xxlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xxlarge/versions/1.20.0/files/stt_en_fastconformer_transducer_xxlarge.nemo", + ) + results.append(model) + return results def __init__(self, cfg: DictConfig, trainer: Trainer = None): From 6903d9be12c7b5882913728663810da9a7f690d0 Mon Sep 17 00:00:00 2001 From: bene-ges Date: Thu, 8 Jun 2023 07:31:59 +0300 Subject: [PATCH 028/123] add reference to our paper (#6821) * add reference to our paper Signed-off-by: Alexandra Antonova * add paper reference to docs Signed-off-by: Alexandra Antonova --------- Signed-off-by: Alexandra Antonova --- docs/source/nlp/nlp_all.bib | 9 ++++ .../nlp/spellchecking_asr_customization.rst | 2 +- .../spellchecking_asr_customization/README.md | 4 +- .../spellchecking_model.py | 1 + ...pellMapper_English_ASR_Customization.ipynb | 42 +++++++++++-------- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git 
a/docs/source/nlp/nlp_all.bib b/docs/source/nlp/nlp_all.bib index fd0f15f6d1da..48a53240e52b 100644 --- a/docs/source/nlp/nlp_all.bib +++ b/docs/source/nlp/nlp_all.bib @@ -216,3 +216,12 @@ @article{jegou2022faiss pages={ascl--2210}, year={2022} } + +@misc{antonova2023spellmapper, + title={SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings}, + author={Alexandra Antonova and Evelina Bakhturina and Boris Ginsburg}, + year={2023}, + eprint={2306.02317}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} diff --git a/docs/source/nlp/spellchecking_asr_customization.rst b/docs/source/nlp/spellchecking_asr_customization.rst index f9009b520361..c6666c4e338c 100644 --- a/docs/source/nlp/spellchecking_asr_customization.rst +++ b/docs/source/nlp/spellchecking_asr_customization.rst @@ -3,7 +3,7 @@ SpellMapper (Spellchecking ASR Customization) Model ===================================================== -SpellMapper is a non-autoregressive model for postprocessing of ASR output. It gets as input a single ASR hypothesis (text) and a custom vocabulary and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Unlike traditional spellchecking approaches, which aim to correct known words using language models, SpellMapper's goal is to correct highly specific user terms, out-of-vocabulary (OOV) words or spelling variations (e.g., "John Koehn", "Jon Cohen"). +`SpellMapper <https://arxiv.org/abs/2306.02317>`__ :cite:`nlp-ner-antonova2023spellmapper` is a non-autoregressive model for postprocessing of ASR output. It gets as input a single ASR hypothesis (text) and a custom vocabulary and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.
Unlike traditional spellchecking approaches, which aim to correct known words using language models, SpellMapper's goal is to correct highly specific user terms, out-of-vocabulary (OOV) words or spelling variations (e.g., "John Koehn", "Jon Cohen"). This model is an alternative to word boosting/shallow fusion approaches: diff --git a/examples/nlp/spellchecking_asr_customization/README.md b/examples/nlp/spellchecking_asr_customization/README.md index 2d83fd8d11ad..9d2063eff181 100644 --- a/examples/nlp/spellchecking_asr_customization/README.md +++ b/examples/nlp/spellchecking_asr_customization/README.md @@ -1,6 +1,6 @@ # SpellMapper - spellchecking model for ASR Customization - -This model is inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf, but does not repeat its implementation. +Paper: https://arxiv.org/abs/2306.02317 +This model was partly inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf. The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index fc889de2dc63..15ffb2dd1bcd 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -43,6 +43,7 @@ @experimental class SpellcheckingAsrCustomizationModel(NLPModel): """ + https://arxiv.org/abs/2306.02317 BERT-based model for Spellchecking ASR Customization. It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. 
diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb index e11025aeb1d3..cc949ad699b3 100644 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb @@ -1,13 +1,14 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": { "id": "PiRuohn_FQco" }, "source": [ "# Overview\n", - "This tutorial demonstrates how to run inference with SpellMapper - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", + "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", "\n", "Estimated time: 10-15 min.\n", "\n", @@ -957,25 +958,25 @@ }, { "cell_type": "markdown", - "source": [ - "Free GPU memory to avoid OOM." - ], "metadata": { "id": "bt2TMLLvdUHm" - } + }, + "source": [ + "Free GPU memory to avoid OOM." + ] }, { "cell_type": "code", - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ], + "execution_count": null, "metadata": { "id": "ZwEpAOCaRH7s" }, "outputs": [], - "execution_count": null + "source": [ + "del spectrogram_generator\n", + "del vocoder\n", + "torch.cuda.empty_cache()" + ] }, { "cell_type": "markdown", @@ -1363,22 +1364,27 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "id": "upvTbkFAeYtR" }, "source": [ "# Final notes\n", - "1. Our paper...\n", + "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", "\n", - "2. To reproduce evaluation experiments from this paper see these scripts:\n", + "2. 
Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", + "\n", + "3. To reproduce evaluation experiments from this paper see these scripts:\n", " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", "\n", - "3. To reproduce training see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", + "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", + "\n", + "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", "\n", - "4. Promising future research directions would be:\n", + "6. 
Promising future research directions would be:\n", " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", " - retrain with adding more various false positives to the training data" ] @@ -1387,9 +1393,9 @@ "metadata": { "accelerator": "GPU", "colab": { - "toc_visible": true, + "gpuType": "T4", "provenance": [], - "gpuType": "T4" + "toc_visible": true }, "kernelspec": { "display_name": "Python 3", @@ -1401,4 +1407,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 9cca92bb37edd7d43d3a937f2abb7c1706807056 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 8 Jun 2023 09:21:33 -0700 Subject: [PATCH 029/123] Upperbound Numpy to < 1.24 (#6829) Signed-off-by: smajumdar Co-authored-by: Eric Harper --- requirements/requirements.txt | 2 +- requirements/requirements_nlp.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 9d4fab43186b..7481e337c999 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ huggingface_hub numba -numpy>=1.22 +numpy>=1.22,<1.24 onnx>=1.7.0 python-dateutil ruamel.yaml diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index b4e5ac5c5801..1ff4c444c2bf 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -14,7 +14,6 @@ markdown2 matplotlib>=3.3.2 megatron_core==0.1.0 nltk>=3.6.5 -numpy opencc pangu rapidfuzz From b67d41090ca3302d059a8f31e0dc9efa8b52b003 Mon Sep 17 00:00:00 2001 From: Vahid Noroozi Date: Thu, 8 Jun 2023 10:12:22 -0700 Subject: [PATCH 030/123] Multi-lookahead cache-aware streaming models (#6711) * added methods. Signed-off-by: Vahid * added methods. Signed-off-by: Vahid * added initial code. Signed-off-by: Vahid * added initial code. Signed-off-by: Vahid * added initial code. Signed-off-by: Vahid * added config files. 
Signed-off-by: Vahid * fixed bugs. Signed-off-by: Vahid * updated confs. Signed-off-by: Vahid * updated confs. Signed-off-by: Vahid * updated confs. Signed-off-by: Vahid * updated confs. Signed-off-by: Vahid * improved f.conv1d Signed-off-by: Vahid * pulled from main. Signed-off-by: Vahid * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pulled from main. Signed-off-by: Vahid * added postpostnorm. Signed-off-by: Vahid * fixed the target continiouse bug. Signed-off-by: Vahid * added dw_striding causal. Signed-off-by: Vahid * added print for debugging. Signed-off-by: Vahid * added print for debugging. Signed-off-by: Vahid * fixed causal convolutions. Signed-off-by: Vahid * added _midnorm. Signed-off-by: Vahid * fixed transcribe. Signed-off-by: Vahid * cleaned code. Signed-off-by: Vahid * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * moved back configs. Signed-off-by: Vahid * moved back configs. Signed-off-by: Vahid * updated fast emit for FC models. Signed-off-by: Vahid * updated fast emit for FC models. Signed-off-by: Vahid * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed bug. Signed-off-by: Vahid * fixed bug and addressed comments. Signed-off-by: Vahid * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed configs. Signed-off-by: Vahid * fixed configs. Signed-off-by: Vahid * dropped the test. 
Signed-off-by: Vahid --------- Signed-off-by: Vahid Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conformer_ctc_bpe_streaming.yaml | 8 +- .../conformer_transducer_bpe_streaming.yaml | 10 +- .../fastconformer_ctc_bpe_streaming.yaml | 9 +- .../fastconformer_ctc_char_streaming.yaml | 10 +- ...astconformer_transducer_bpe_streaming.yaml | 15 +- ...stconformer_transducer_char_streaming.yaml | 16 +- ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 15 +- ..._hybrid_transducer_ctc_char_streaming.yaml | 15 +- .../asr/modules/conformer_encoder.py | 306 ++++++++++-------- .../asr/modules/squeezeformer_encoder.py | 1 - .../parts/submodules/multi_head_attention.py | 6 +- tests/collections/nlp/test_huggingface.py | 19 +- 12 files changed, 272 insertions(+), 158 deletions(-) diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml index 98f23458cd86..32afd919a454 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml @@ -103,10 +103,16 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # for chunked_limited you may calculate the look-ahead or right context by the following formula: # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s + + # For multi-lookahead models, you may specify a list of context sizes. 
During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. + # An example of settings for multi-lookahead: + # att_context_size: [[140,27],[140,13],[140,2],[140,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [140, 27] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null xscaling: true # scales up the input embeddings by sqrt(d_model) untie_biases: true # unties the biases of the TransformerXL layers diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml index 9d6e3a54d9fe..d55e5f927b2e 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml @@ -113,10 +113,16 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # for chunked_limited you may calculate the look-ahead or right context by the following formula: - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs.
+ # The first item in the list would be the default during test/validation/inference. + # An example of settings for multi-lookahead: + # att_context_size: [[140,27],[140,13],[140,2],[140,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [140, 27] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null xscaling: true # scales up the input embeddings by sqrt(d_model) untie_biases: true # unties the biases of the TransformerXL layers diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml index c68b30a33d5a..749216b1925d 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -97,10 +97,17 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # for chunked_limited you may calculate the look-ahead or right context by the following formula: # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference.
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml index 654895ec065d..17345119c529 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -100,11 +100,19 @@ model: n_heads: 8 # may need to be lower for smaller d_models # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 2 as multiple-layers may increase the effective right context too large + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference.
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml index 5f223061a420..dbd036458cb8 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -102,10 +102,17 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # for chunked_limited you may calculate the look-ahead or right context by the following formula: # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference.
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 @@ -191,9 +198,9 @@ model: loss_name: "default" warprnnt_numba_kwargs: # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - # It also helps to improve the accuracy of the model in streaming mode - fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
optim: diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml index 68a78ba60aac..50f73d35ca75 100644 --- a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -106,11 +106,19 @@ model: n_heads: 8 # may need to be lower for smaller d_models # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 2 as multiple-layers may increase the effective right context too large + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 @@ -196,9 +204,9 @@ model: loss_name: "default" warprnnt_numba_kwargs: # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - # It also helps to improve the accuracy of the model in streaming mode - fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
optim: diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 8b7a2ce7b39d..26dabaa039fe 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -8,6 +8,8 @@ # FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml # FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +# Note: if training loss does not converge, you may increase warm-up to 20K. + name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming" model: @@ -106,8 +108,15 @@ model: # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 @@ -206,9 +215,9 @@ model: loss_name: "default" warprnnt_numba_kwargs: # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - # It also helps to improve the accuracy of the model in streaming mode - fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. 
optim: diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml index a24829b50788..d8362636f04a 100644 --- a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -8,6 +8,8 @@ # FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml # FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +# Note: if training loss does not converge, you may increase warm-up to 20K. + name: "FastConformer-Hybrid-Transducer-CTC-Char-Streaming" model: @@ -111,8 +113,15 @@ model: # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25] att_context_size: [70, 13] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 @@ -211,9 +220,9 @@ model: loss_name: "default" warprnnt_numba_kwargs: # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to reduce the latency of the model for streaming - # It also helps to improve the accuracy of the model in streaming mode - fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. optim: diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 7c786f9c9720..74c255741039 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -13,6 +13,7 @@ # limitations under the License. import math +import random from collections import OrderedDict from dataclasses import dataclass from typing import List, Optional, Set @@ -89,9 +90,13 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to 5000 n_heads (int): number of heads in multi-headed attention layers Defaults to 4. - att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes, - or None for full context. - Defaults to None. 
+ att_context_size (List[Union[List[int],int]]): specifies the context sizes on each side. Each context size should be a list of two integers like [100,100]. + A list of context sizes like [[100,100],[100,50]] can also be passed. -1 means unlimited context. + Defaults to [-1,-1] + att_context_probs (List[float]): a list of probabilities, one for each att_context_size, used when a list of context sizes is passed. If not specified, a uniform distribution is used. + Defaults to None + att_context_style (str): 'regular' or 'chunked_limited'. + Defaults to 'regular' xscaling (bool): enables scaling the inputs to the multi-headed attention layers by sqrt(d_model) Defaults to True. untie_biases (bool): whether to not share (untie) the bias weights between layers of Transformer-XL @@ -100,6 +105,11 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): Defaults to 31. conv_norm_type (str): the type of the normalization in the convolutional modules Defaults to 'batch_norm'. + conv_context_size (list): it can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size. + None means [(conv_kernel_size-1)//2, (conv_kernel_size-1)//2], and 'causal' means [(conv_kernel_size-1), 0]. + Defaults to None. + conv_dual_mode (bool): specifies if convolution should be dual mode when dual_offline mode is being used. When enabled, the left half of the convolution kernel would get masked in streaming cases. + Defaults to False dropout (float): the dropout rate used in all layers except the attention layers Defaults to 0.1. 
dropout_pre_encoder (float): the dropout rate used before the encoder @@ -256,6 +266,7 @@ def __init__( self_attention_model='rel_pos', n_heads=4, att_context_size=None, + att_context_probs=None, att_context_style='regular', xscaling=True, untie_biases=True, @@ -279,7 +290,6 @@ def __init__( self.d_model = d_model self.n_layers = n_layers self._feat_in = feat_in - self.scale = math.sqrt(self.d_model) self.att_context_style = att_context_style self.subsampling_factor = subsampling_factor self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor @@ -289,51 +299,19 @@ def __init__( self.global_attn_separate = global_attn_separate self.global_tokens_spacing = global_tokens_spacing - if att_context_size: - self.att_context_size = list(att_context_size) - else: - self.att_context_size = [-1, -1] - - if isinstance(conv_context_size, ListConfig): - conv_context_size = list(conv_context_size) - - if conv_context_size is not None: - if ( - not isinstance(conv_context_size, list) - and not isinstance(conv_context_size, str) - and not isinstance(conv_context_size, ListConfig) - ): - raise ValueError( - f"Invalid conv_context_size! It should be the string 'causal' or a list of two integers." 
- ) - if conv_context_size == "causal": - conv_context_size = [conv_kernel_size - 1, 0] - else: - if conv_context_size[0] + conv_context_size[1] + 1 != conv_kernel_size: - raise ValueError(f"Invalid conv_context_size: {self.conv_context_size}!") - else: - conv_context_size = [(conv_kernel_size - 1) // 2, (conv_kernel_size - 1) // 2] - self.conv_context_size = conv_context_size - - if att_context_style == "chunked_limited": - # the left context for self-attention in chunked_limited mode should be dividable by the right context - # right context=att_context_size[1]+1, and left_context=self.att_context_size[0] - if self.att_context_size[0] > 0 and self.att_context_size[0] % (self.att_context_size[1] + 1) > 0: - raise ValueError("att_context_size[0] % (att_context_size[1] + 1) should be zero!") - if self.att_context_size[1] < 0: - raise ValueError("Right context can not be unlimited for chunked_limited style!") - self.chunk_size = self.att_context_size[1] + 1 - - # left_chunks_num specifies the number of chunks to be visible by each chunk on the left side - if self.att_context_size[0] >= 0: - self.left_chunks_num = self.att_context_size[0] // self.chunk_size - else: - self.left_chunks_num = 100000 - - elif att_context_style == "regular": - self.chunk_size = None - else: - raise ValueError("Invalid att_context_style!") + # Setting up the att_context_size + ( + self.att_context_size_all, + self.att_context_size, + self.att_context_probs, + self.conv_context_size, + ) = self._calc_context_sizes( + att_context_style=att_context_style, + att_context_size=att_context_size, + att_context_probs=att_context_probs, + conv_context_size=conv_context_size, + conv_kernel_size=conv_kernel_size, + ) if xscaling: self.xscale = math.sqrt(d_model) @@ -379,6 +357,7 @@ def __init__( self._feat_out = d_model + # Biases for relative positional encoding if not untie_biases and self_attention_model == "rel_pos": d_head = d_model // n_heads pos_bias_u = nn.Parameter(torch.Tensor(n_heads, 
d_head)) @@ -389,8 +368,8 @@ def __init__( pos_bias_u = None pos_bias_v = None + # Positional encodings self.pos_emb_max_len = pos_emb_max_len - self.att_mask = None if self_attention_model == "rel_pos": self.pos_enc = RelPositionalEncoding( d_model=d_model, @@ -458,51 +437,6 @@ def __init__( # will be set in self.forward() if defined in AccessMixin config self.interctc_capture_at_layers = None - def update_max_seq_length(self, seq_length: int, device): - # Find global max audio length across all nodes - if torch.distributed.is_initialized(): - global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) - - # Update across all ranks in the distributed system - torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) - - seq_length = global_max_len.to(torch.int64).item() - - if seq_length > self.max_audio_length: - self.set_max_audio_length(seq_length) - - def set_max_audio_length(self, max_audio_length): - """ - Sets maximum input length. - Pre-calculates internal seq_range mask. 
- """ - self.max_audio_length = max_audio_length - device = next(self.parameters()).device - self.pos_enc.extend_pe(max_audio_length, device) - - if self.self_attention_model != "rel_pos_local_attn": - att_mask = torch.ones(1, max_audio_length, max_audio_length, dtype=torch.bool, device=device) - if self.chunk_size is None: - if self.att_context_size[0] >= 0: - att_mask = att_mask.triu(diagonal=-self.att_context_size[0]) - if self.att_context_size[1] >= 0: - att_mask = att_mask.tril(diagonal=self.att_context_size[1]) - else: - chunk_idx = torch.arange(0, max_audio_length, dtype=torch.int64, device=att_mask.device) - chunk_idx = torch.div(chunk_idx, self.chunk_size, rounding_mode="trunc") - diff_chunks = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0) - chunked_limited_mask = torch.logical_and( - torch.le(diff_chunks, self.left_chunks_num), torch.ge(diff_chunks, 0) - ) - att_mask = torch.logical_and(att_mask, chunked_limited_mask.unsqueeze(0)) - - if hasattr(self, 'att_mask'): - self.att_mask = att_mask - else: - self.register_buffer('att_mask', att_mask, persistent=False) - else: - self.att_mask = None - def forward_for_export( self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None ): @@ -565,17 +499,24 @@ def forward_internal( self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None ): self.update_max_seq_length(seq_length=audio_signal.size(2), device=audio_signal.device) - max_audio_length = audio_signal.size(-1) if length is None: length = audio_signal.new_full( - (audio_signal.size(0),), max_audio_length, dtype=torch.int64, device=audio_signal.device + (audio_signal.size(0),), audio_signal.size(-1), dtype=torch.int64, device=audio_signal.device ) if cache_last_time is not None: cache_last_time_next = torch.zeros_like(cache_last_time) else: cache_last_time_next = None + + # select a random att_context_size with the distribution specified by att_context_probs during 
training + # for non-training cases like test, validation or inference, it uses the default context size in self.att_context_size + if self.training and len(self.att_context_size_all) > 1: + cur_att_context_size = random.choices(self.att_context_size_all, weights=self.att_context_probs)[0] + else: + cur_att_context_size = self.att_context_size + audio_signal = torch.transpose(audio_signal, 1, 2) if isinstance(self.pre_encode, nn.Linear): @@ -588,11 +529,10 @@ def forward_internal( audio_signal = audio_signal[:, self.streaming_cfg.drop_extra_pre_encoded :, :] length = (length - self.streaming_cfg.drop_extra_pre_encoded).clamp(min=0) - max_audio_length = audio_signal.size(1) - if self.reduction_position is not None and cache_last_channel is not None: raise ValueError("Caching with reduction feature is not supported yet!") + max_audio_length = audio_signal.size(1) if cache_last_channel is not None: cache_len = self.streaming_cfg.last_channel_cache_size cache_keep_size = max_audio_length - self.streaming_cfg.cache_drop_size @@ -606,17 +546,20 @@ def forward_internal( cache_len = 0 offset = None - if self.self_attention_model == 'abs_pos': - audio_signal, pos_emb = self.pos_enc(x=audio_signal) - else: - audio_signal, pos_emb = self.pos_enc(x=audio_signal, cache_len=cache_len) + audio_signal, pos_emb = self.pos_enc(x=audio_signal, cache_len=cache_len) # Create the self-attention and padding masks - pad_mask, att_mask = self._create_masks(max_audio_length, padding_length, offset, audio_signal.device) + pad_mask, att_mask = self._create_masks( + att_context_size=cur_att_context_size, + padding_length=padding_length, + max_audio_length=max_audio_length, + offset=offset, + device=audio_signal.device, + ) if cache_last_channel is not None: pad_mask = pad_mask[:, cache_len:] - if self.att_mask is not None: + if att_mask is not None: att_mask = att_mask[:, cache_len:] for lth, (drop_prob, layer) in enumerate(zip(self.layer_drop_probs, self.layers)): @@ -650,7 +593,13 @@ def 
forward_internal( # Don't update the audio_signal here because then it will again scale the audio_signal # and cause an increase in the WER _, pos_emb = self.pos_enc(x=audio_signal, cache_len=cache_len) - pad_mask, att_mask = self._create_masks(max_audio_length, length, offset, audio_signal.device) + pad_mask, att_mask = self._create_masks( + att_context_size=cur_att_context_size, + padding_length=length, + max_audio_length=max_audio_length, + offset=offset, + device=audio_signal.device, + ) # saving tensors if required for interctc loss if self.is_access_enabled(): @@ -687,7 +636,60 @@ def forward_internal( else: return audio_signal, length - def _create_masks(self, max_audio_length, padding_length, offset, device): + def update_max_seq_length(self, seq_length: int, device): + # Find global max audio length across all nodes + if torch.distributed.is_initialized(): + global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) + + # Update across all ranks in the distributed system + torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) + + seq_length = global_max_len.int().item() + + if seq_length > self.max_audio_length: + self.set_max_audio_length(seq_length) + + def set_max_audio_length(self, max_audio_length): + """ + Sets maximum input length. + Pre-calculates internal seq_range mask. 
+ """ + self.max_audio_length = max_audio_length + device = next(self.parameters()).device + self.pos_enc.extend_pe(max_audio_length, device) + + def _create_masks(self, att_context_size, padding_length, max_audio_length, offset, device): + if self.self_attention_model != "rel_pos_local_attn": + att_mask = torch.ones(1, max_audio_length, max_audio_length, dtype=torch.bool, device=device) + + if self.att_context_style == "regular": + if att_context_size[0] >= 0: + att_mask = att_mask.triu(diagonal=-att_context_size[0]) + if att_context_size[1] >= 0: + att_mask = att_mask.tril(diagonal=att_context_size[1]) + elif self.att_context_style == "chunked_limited": + # When right context is unlimited, just the left side of the masking need to get updated + if att_context_size[1] == -1: + if att_context_size[0] >= 0: + att_mask = att_mask.triu(diagonal=-att_context_size[0]) + else: + chunk_size = att_context_size[1] + 1 + # left_chunks_num specifies the number of chunks to be visible by each chunk on the left side + if att_context_size[0] >= 0: + left_chunks_num = att_context_size[0] // chunk_size + else: + left_chunks_num = 10000 + + chunk_idx = torch.arange(0, max_audio_length, dtype=torch.int, device=att_mask.device) + chunk_idx = torch.div(chunk_idx, chunk_size, rounding_mode="trunc") + diff_chunks = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0) + chunked_limited_mask = torch.logical_and( + torch.le(diff_chunks, left_chunks_num), torch.ge(diff_chunks, 0) + ) + att_mask = torch.logical_and(att_mask, chunked_limited_mask.unsqueeze(0)) + else: + att_mask = None + # pad_mask is the masking to be used to ignore paddings pad_mask = torch.arange(0, max_audio_length, device=device).expand( padding_length.size(0), -1 @@ -697,24 +699,19 @@ def _create_masks(self, max_audio_length, padding_length, offset, device): pad_mask_off = torch.arange(0, max_audio_length, device=device).expand( padding_length.size(0), -1 ) >= offset.unsqueeze(-1) - pad_mask = 
pad_mask_off.logical_and(pad_mask) - if self.att_mask is not None: + if att_mask is not None: # pad_mask_for_att_mask is the mask which helps to ignore paddings pad_mask_for_att_mask = pad_mask.unsqueeze(1).repeat([1, max_audio_length, 1]) pad_mask_for_att_mask = torch.logical_and(pad_mask_for_att_mask, pad_mask_for_att_mask.transpose(1, 2)) # att_mask is the masking to be used by the MHA layers to ignore the tokens not supposed to be visible - att_mask = self.att_mask[:, :max_audio_length, :max_audio_length] + att_mask = att_mask[:, :max_audio_length, :max_audio_length] # paddings should also get ignored, so pad_mask_for_att_mask is used to ignore their corresponding scores att_mask = torch.logical_and(pad_mask_for_att_mask, att_mask.to(pad_mask_for_att_mask.device)) - att_mask = ~att_mask - else: - att_mask = None pad_mask = ~pad_mask - return pad_mask, att_mask def enable_pad_mask(self, on=True): @@ -723,8 +720,64 @@ def enable_pad_mask(self, on=True): self.use_pad_mask = on return mask + def _calc_context_sizes( + self, att_context_size, att_context_probs, att_context_style, conv_context_size, conv_kernel_size + ): + # convert att_context_size to a standard list of lists + if att_context_size: + att_context_size_all = list(att_context_size) + if isinstance(att_context_size_all[0], int): + att_context_size_all = [att_context_size_all] + for i, att_cs in enumerate(att_context_size_all): + if isinstance(att_cs, ListConfig): + att_context_size_all[i] = list(att_cs) + if att_context_style == "chunked_limited": + if att_cs[0] > 0 and att_cs[0] % (att_cs[1] + 1) > 0: + raise ValueError(f"att_context_size[{i}][0] % (att_context_size[{i}][1] + 1) should be zero!") + if att_cs[1] < 0 and len(att_context_size_all) <= 1: + raise ValueError( + f"Right context (att_context_size[{i}][1]) can not be unlimited for chunked_limited style!" 
+ ) + else: + att_context_size_all = [[-1, -1]] + + if att_context_probs: + if len(att_context_probs) != len(att_context_size_all): + raise ValueError("The size of the att_context_probs should be the same as att_context_size.") + att_context_probs = list(att_context_probs) + if sum(att_context_probs) != 1: + raise ValueError( + "The sum of numbers in att_context_probs should be equal to one to be a distribution." + ) + else: + att_context_probs = [1.0 / len(att_context_size_all)] * len(att_context_size_all) + + if conv_context_size is not None: + if isinstance(conv_context_size, ListConfig): + conv_context_size = list(conv_context_size) + if not isinstance(conv_context_size, list) and not isinstance(conv_context_size, str): + raise ValueError( + f"Invalid conv_context_size! It should be the string 'causal' or a list of two integers." + ) + if conv_context_size == "causal": + conv_context_size = [conv_kernel_size - 1, 0] + else: + if conv_context_size[0] + conv_context_size[1] + 1 != conv_kernel_size: + raise ValueError(f"Invalid conv_context_size: {self.conv_context_size}!") + else: + conv_context_size = [(conv_kernel_size - 1) // 2, (conv_kernel_size - 1) // 2] + return att_context_size_all, att_context_size_all[0], att_context_probs, conv_context_size + + def set_default_att_context_size(self, att_context_size): + self.att_context_size = att_context_size + def setup_streaming_params( - self, chunk_size: int = None, shift_size: int = None, left_chunks: int = None, max_context: int = 10000 + self, + chunk_size: int = None, + shift_size: int = None, + left_chunks: int = None, + att_context_size: list = None, + max_context: int = 10000, ): """ This function sets the needed values and parameters to perform streaming. The configuration would be stored in self.streaming_cfg. 
@@ -737,25 +790,28 @@ def setup_streaming_params( Defaults to -1 (means feat_out is d_model) """ streaming_cfg = CacheAwareStreamingConfig() + + # When att_context_size is not specified, it uses the default_att_context_size + if att_context_size is None: + att_context_size = self.att_context_size + if chunk_size is not None: if chunk_size < 1: raise ValueError("chunk_size needs to be a number larger or equal to one.") lookahead_steps = chunk_size - 1 streaming_cfg.cache_drop_size = chunk_size - shift_size elif self.att_context_style == "chunked_limited": - lookahead_steps = self.att_context_size[1] + lookahead_steps = att_context_size[1] streaming_cfg.cache_drop_size = 0 elif self.att_context_style == "regular": - lookahead_steps = self.att_context_size[1] * self.n_layers + self.conv_context_size[1] * self.n_layers + lookahead_steps = att_context_size[1] * self.n_layers + self.conv_context_size[1] * self.n_layers streaming_cfg.cache_drop_size = lookahead_steps else: streaming_cfg.cache_drop_size = 0 lookahead_steps = None if chunk_size is None: - streaming_cfg.last_channel_cache_size = ( - self.att_context_size[0] if self.att_context_size[0] >= 0 else max_context - ) + streaming_cfg.last_channel_cache_size = att_context_size[0] if att_context_size[0] >= 0 else max_context else: if left_chunks is None: raise ValueError("left_chunks can not be None when chunk_size is set.") @@ -878,9 +934,9 @@ def change_attention_model( 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using overlapping windows. Attention context is determined by att_context_size parameter. 'abs_pos': absolute positional embedding and Transformer - If None is provided, the self_attention_model isn't changed. Defauts to None. + If None is provided, the self_attention_model isn't changed. Defaults to None. att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes, - or None to keep as it is. Defauts to None. 
+ or None to keep as it is. Defaults to None. update_config (bool): Whether to update the config or not with the new attention model. Defaults to True. device (torch.device): If provided, new layers will be moved to the device. @@ -889,19 +945,16 @@ def change_attention_model( if att_context_size: att_context_size = list(att_context_size) - elif hasattr(self._cfg, "att_context_size"): - att_context_size = self._cfg.att_context_size else: att_context_size = self.att_context_size if self_attention_model is None: - self_attention_model = self._cfg.self_attention_model + self_attention_model = self.self_attention_model if self_attention_model == 'rel_pos_local_attn' and max(att_context_size) <= 0: raise ValueError("When using local attention, context size must be set > 0") if self_attention_model == "rel_pos": - self.att_mask = None new_pos_enc = RelPositionalEncoding( d_model=self._cfg.d_model, dropout_rate=self._cfg.dropout, @@ -938,7 +991,6 @@ def change_attention_model( for name, m in self.named_modules(): if type(m) == ConformerLayer: - if self_attention_model == 'rel_pos': new_attn = RelPositionMultiHeadAttention( n_head=self._cfg.n_heads, diff --git a/nemo/collections/asr/modules/squeezeformer_encoder.py b/nemo/collections/asr/modules/squeezeformer_encoder.py index 952c9b53d233..a887abd19ebb 100644 --- a/nemo/collections/asr/modules/squeezeformer_encoder.py +++ b/nemo/collections/asr/modules/squeezeformer_encoder.py @@ -149,7 +149,6 @@ def __init__( d_ff = d_model * ff_expansion_factor self.d_model = d_model self._feat_in = feat_in - self.scale = math.sqrt(self.d_model) if att_context_size: self.att_context_size = att_context_size else: diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index 40baf1141bd3..b7356ffe87e4 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -888,17 
+888,19 @@ def extend_pe(self, length, device): positions = torch.arange(0, length, dtype=torch.float32, device=device).unsqueeze(1) self.create_pe(positions=positions) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, cache_len=0): """Adds positional encoding. Args: x (torch.Tensor): Input. Its shape is (batch, time, feature_size) + cache_len (int): the size of the cache which is used to shift positions Returns: x+pos_emb (torch.Tensor): Its shape is (batch, time, feature_size) pos_emb (torch.Tensor): Its shape is (1, time, feature_size) """ + input_len = x.size(1) + cache_len if self.xscale: x = x * self.xscale - pos_emb = self.pe[:, : x.size(1)] + pos_emb = self.pe[:, :input_len] if self.dropout_emb: pos_emb = self.dropout_emb(pos_emb) x = x + pos_emb diff --git a/tests/collections/nlp/test_huggingface.py b/tests/collections/nlp/test_huggingface.py index cfe2845caa9b..0ad7b5850475 100644 --- a/tests/collections/nlp/test_huggingface.py +++ b/tests/collections/nlp/test_huggingface.py @@ -85,12 +85,13 @@ def test_get_pretrained_chinese_bert_wwm_model(self): tokenizer = get_tokenizer(tokenizer_name=model_name) assert isinstance(tokenizer, AutoTokenizer) - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_get_pretrained_arabic_model(self): - model_name = 'asafaya/bert-base-arabic' - self.omega_conf.language_model.pretrained_model_name = model_name - model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf) - assert isinstance(model, nemo_nlp.modules.BertModule) - tokenizer = get_tokenizer(tokenizer_name=model_name) - assert isinstance(tokenizer, AutoTokenizer) + # model is not on HF anymore + # @pytest.mark.with_downloads() + # @pytest.mark.unit + # def test_get_pretrained_arabic_model(self): + # model_name = 'asafaya/bert-base-arabic' + # self.omega_conf.language_model.pretrained_model_name = model_name + # model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf) + # assert isinstance(model, nemo_nlp.modules.BertModule) + # 
tokenizer = get_tokenizer(tokenizer_name=model_name) + # assert isinstance(tokenizer, AutoTokenizer) From 3ed3c4e7d251f32b2be5c835a89e6ab2f389a056 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Thu, 8 Jun 2023 22:33:01 +0300 Subject: [PATCH 031/123] added changes to ramp up bs (#6799) * rampup bs changes Signed-off-by: dimapihtar * rampup bs changes Signed-off-by: dimapihtar * fixed styling Signed-off-by: dimapihtar * fix bug Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --------- Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Eric Harper --- .../megatron/data_samplers.py | 3 +- .../language_modeling/megatron_base_model.py | 18 ++++-- .../language_modeling/megatron_gpt_model.py | 55 +++++++++---------- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index edc58ee999c2..7df915533492 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -33,6 +33,7 @@ def __init__( data_parallel_size: int, drop_last: bool = True, global_batch_size: Optional[int] = None, + rampup_batch_size: Optional[list] = None, pad_samples_to_global_batch_size: Optional[bool] = False, ) -> None: # Sanity checks. 
@@ -50,7 +51,7 @@ def __init__( data_parallel_rank, data_parallel_size ) ) - if global_batch_size is not None: + if global_batch_size is not None and rampup_batch_size is None: if global_batch_size % (micro_batch_size * data_parallel_size) != 0: raise RuntimeError( f"`global_batch_size` ({global_batch_size}) is not divisible by " diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 2aaedbe5a806..2568a14f8dbf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -481,10 +481,20 @@ def configure_optimizers(self): def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() - consumed_samples = ( - self.init_consumed_samples - + steps_since_resume * app_state.data_parallel_size * self.cfg.micro_batch_size * get_num_microbatches() - ) + + if self.cfg.get('rampup_batch_size', None): + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + current_global_batch_size = getattr(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, 'current_global_batch_size', 1) + consumed_samples = self.prev_consumed_samples + self.if_first_step * current_global_batch_size + else: + consumed_samples = ( + self.init_consumed_samples + + steps_since_resume + * app_state.data_parallel_size + * self.cfg.micro_batch_size + * get_num_microbatches() + ) return int(consumed_samples) def _extract_consumed_samples_from_ckpt(self, ckpt_path): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3530ffcfc371..8eff896cf9d8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -208,6 +208,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): 
self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) + self.rampup_batch_size = self.cfg.get('rampup_batch_size', None) + if self.rampup_batch_size: + self.prev_consumed_samples = 0 + self.if_first_step = 0 + self.prev_global_batch_size = None + if not self.megatron_amp_o2 and self.cfg.get('virtual_pipeline_model_parallel_size', None): raise ValueError('Virtual pipeline model parallel is only supported when using megatron_amp_O2') @@ -507,6 +513,13 @@ def training_step(self, dataloader_iter, batch_idx): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ + if self.rampup_batch_size: + num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR + current_global_batch_size = num_microbatch_calculator.current_global_batch_size + logging.info(current_global_batch_size) + # do validation and save the checkpoint when gbs is changed + if self.prev_global_batch_size != current_global_batch_size and self.prev_global_batch_size: + self.trainer.should_stop = True # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() @@ -580,16 +593,15 @@ def training_step(self, dataloader_iter, batch_idx): 'consumed_samples', consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, ) - if self.cfg.get('rampup_batch_size', None): - micro_batch_size = self.cfg.get('micro_batch_size', 1) - total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes - current_global_batch_size = get_num_microbatches() * micro_batch_size * total_gpus_number - self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) - - num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR + if self.rampup_batch_size: + self.prev_global_batch_size = current_global_batch_size + self.prev_consumed_samples = consumed_samples 
num_microbatch_calculator.update( - consumed_samples=consumed_samples, consistency_check=True, + consumed_samples=consumed_samples, consistency_check=False, ) + current_global_batch_size = num_microbatch_calculator.current_global_batch_size + self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) + self.if_first_step = 1 return loss_mean @@ -936,6 +948,7 @@ def build_pretraining_data_loader( data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=drop_last, global_batch_size=self.cfg.global_batch_size, + rampup_batch_size=self.cfg.rampup_batch_size, pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, ) elif self.cfg.data.dataloader_type == 'cyclic': @@ -986,28 +999,10 @@ def setup(self, stage=None): self.init_consumed_samples = init_consumed_samples self.init_global_step = self.trainer.global_step - rampup_batch_size = self.cfg.get('rampup_batch_size', None) - if rampup_batch_size: - start_batch_size = rampup_batch_size[0] - batch_size_increment = rampup_batch_size[1] - total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes - - assert start_batch_size % (total_gpus_number) == 0, ( - 'expected' - ' start batch size ({}) to be divisible by total number of GPUs' - ' ({})'.format(start_batch_size, total_gpus_number) - ) - - micro_batch_size = self.cfg.get('micro_batch_size', 1) - tensor_model_parallel_size = self.cfg.get('tensor_model_parallel_size', 1) - pipeline_model_parallel_size = self.cfg.get('pipeline_model_parallel_size', 1) - total_data_parallel_size = total_gpus_number // (tensor_model_parallel_size * pipeline_model_parallel_size) - - assert batch_size_increment % (micro_batch_size * total_data_parallel_size) == 0, ( - 'expected' - ' batch size increment ({}) to be divisible by micro_batch_size ({}) times total data parallel size' - ' ({})'.format(batch_size_increment, micro_batch_size, total_data_parallel_size) - ) + if self.rampup_batch_size: + 
num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR + num_microbatch_calculator.update(self.init_consumed_samples, consistency_check=False) + self.prev_consumed_samples = self.init_consumed_samples if stage == 'predict': return From 7538a08affeb1d48286d1643b60a91429ba01efe Mon Sep 17 00:00:00 2001 From: Dounx Date: Fri, 9 Jun 2023 03:45:04 +0800 Subject: [PATCH 032/123] Fix typo in core.rst (#6838) Signed-off-by: Dounx --- docs/source/core/core.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 4f5589653172..7b2edfa0f5c4 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -38,7 +38,7 @@ To see all available pretrained models for a specific NeMo model, use the ``list .. code-block:: Python - nemo_asr.model.EncDecCTCModel.list_available_models() + nemo_asr.models.EncDecCTCModel.list_available_models() For detailed information on the available pretrained models, refer to the collections documentation: From 014fa02bef4355275d7d21ec58ce91788460426b Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 8 Jun 2023 13:48:59 -0700 Subject: [PATCH 033/123] add back ptuning pp2 test (#6394) Signed-off-by: arendu --- Jenkinsfile | 87 ++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 27537b53a557..d16379cabb8a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3588,50 +3588,49 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } - // TODO: add when https://github.com/NVIDIA/apex/pull/1596 is merged - // stage('L2: Megatron GPT Prompt Tuning TP1 PP2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel{ - // stage('GPT Prompt Learning TP=1 PP=2') { - // steps { - // sh "python 
examples/nlp/language_modeling/megatron_gpt_prompt_learning.py \ - // --config-name=megatron_gpt_prompt_learning_config \ - // name='/home/TestData/nlp/prompt_learning/p_tuning_test_pp' \ - // trainer.devices=2 \ - // trainer.max_steps=1 \ - // trainer.val_check_interval=1 \ - // trainer.max_epochs=null \ - // model.optim.name=fused_adam \ - // model.data.num_workers=1 \ - // model.pipeline_model_parallel_size=2 \ - // model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ - // model.existing_tasks=[] \ - // model.new_tasks=['boolq'] \ - // model.data.train_ds=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl'] \ - // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl'] \ - // model.global_batch_size=4" - // sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp" - // sh "python examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py \ - // virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/p_tuning_test_pp.nemo' \ - // gpt_model_file='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ - // inference.greedy=True \ - // inference.add_BOS=False \ - // trainer.devices=2 \ - // pipeline_model_parallel_size=2 \ - // pred_file_path=/home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt \ - // data_paths=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl']" - // sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp.nemo" - // sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt" - // } - // } - // } - // } + stage('L2: Megatron GPT Prompt Tuning TP1 PP2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel{ + stage('GPT Prompt Learning TP=1 PP=2') { + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_prompt_learning.py \ + --config-name=megatron_gpt_prompt_learning_config \ + name='/home/TestData/nlp/prompt_learning/p_tuning_test_pp' \ + 
trainer.devices=2 \ + trainer.max_steps=1 \ + trainer.val_check_interval=1 \ + trainer.max_epochs=null \ + model.optim.name=fused_adam \ + model.data.num_workers=1 \ + model.pipeline_model_parallel_size=2 \ + model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ + model.existing_tasks=[] \ + model.new_tasks=['boolq'] \ + model.data.train_ds=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl'] \ + model.data.validation_ds=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl'] \ + model.global_batch_size=4" + sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp" + sh "python examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py \ + virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/p_tuning_test_pp.nemo' \ + gpt_model_file='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ + inference.greedy=True \ + inference.add_BOS=False \ + trainer.devices=2 \ + pipeline_model_parallel_size=2 \ + pred_file_path=/home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt \ + data_paths=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl']" + sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp.nemo" + sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt" + } + } + } + } // TODO: Add this test back. 
Test was failing on CI machines due to HW error // stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') { From 29ceec05edb8ad0d8e88896c85f6eac7055c3bd3 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 8 Jun 2023 15:21:16 -0700 Subject: [PATCH 034/123] t5 lora tuning (#6612) * t5 lora Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * eval lora t5 Signed-off-by: arendu * adjust different lora dims Signed-off-by: arendu * minor changes Signed-off-by: David Mosallanezhad * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bugfix for state_dict Signed-off-by: David Mosallanezhad --------- Signed-off-by: arendu Signed-off-by: David Mosallanezhad Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: David Mosallanezhad Co-authored-by: David --- .../conf/megatron_t5_lora_inference.yaml | 36 ++++ .../conf/megatron_t5_lora_tuning_config.yaml | 99 +++++++++++ .../tuning/megatron_t5_lora_eval.py | 160 ++++++++++++++++++ .../tuning/megatron_t5_lora_tuning.py | 107 ++++++++++++ .../megatron_t5_adapter_model.py | 129 ++++++++++++++ .../megatron/adapters/parallel_adapters.py | 30 +++- .../nlp/modules/common/megatron/attention.py | 21 ++- 7 files changed, 580 insertions(+), 2 deletions(-) create mode 100644 examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml create mode 100644 examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml create mode 100644 examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py create mode 100644 examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml new file mode 100644 index 000000000000..008241d19389 --- /dev/null +++ 
b/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml @@ -0,0 +1,36 @@ +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: True # add the bos token at the beginning of the prompt + tokens_to_generate: 30 # The number of tokens to generate. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + +data: + test_ds: ??? + num_workers: 1 + global_batch_size: 4 + micro_batch_size: 4 + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +language_model_path: ??? # T5 nemo file path # used when starting from a .nemo file +adapter_model_file: ??? # .nemo file saved during training (using megatron_t5_lora_tuning.py) +pred_file_path: null # save predictions to this file +checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +batch_size: 8 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml new file mode 100644 index 000000000000..6663df58c823 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml @@ -0,0 +1,99 @@ +name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_lora_dim${model.lora_tuning.kqv_adapter_dim} + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 10 + max_steps: 1000 + log_every_n_steps: 1 + val_check_interval: 2 + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + resume_from_checkpoint: null + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: nemo-lora-mt0-tr + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: reduced_train_loss + save_top_k: 1 + mode: min + save_nemo_on_train_end: True # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below + filename: "megatron_t5_adapter_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: 'no-prompts' #'prompt-tuning' # adapter tuning requires no virtual prompts + 
encoder_seq_length: 2048 + gradient_as_bucket_view: false + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + global_batch_size: 4 + micro_batch_size: 4 + validation_global_batch_size: ${model.global_batch_size} + validation_micro_batch_size: ${model.micro_batch_size} + validation_drop_last: False + report_validation_metric: False + validation_metric: accuracy + + restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required + existing_tasks: [] + new_tasks: ["taskname"] + + task_templates: + - taskname: "taskname" # The task name + prompt_template: "{prompt} {completion}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> + total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. + virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens + truncate_field: "prompt" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. + answer_field: "completion" + + lora_tuning: + kqv_adapter_dim: 24 + kv_adapter_dim: 16 + q_adapter_dim: 8 + adapter_dropout: 0.1 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + + data: + train_ds: ??? + validation_ds: ??? 
+ shuffle: True + num_workers: 0 + pin_memory: True + add_eos: True + + + optim: + name: fused_adam + lr: 1e-3 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 0 + min_lr: 0.0 + monitor: val_loss + reduce_on_plateau: false diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py b/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py new file mode 100644 index 000000000000..d9de94843071 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.multiprocessing as mp +from megatron.core import parallel_state +from omegaconf import OmegaConf +from omegaconf.omegaconf import open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5LoraModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.core.config import hydra_runner +from nemo.utils.app_state import AppState + +mp.set_start_method("spawn", force=True) + +""" +This is the script to run an Adapter Tuned GPT Model for text generation. + +Usage: + Assume the model has TP=1, PP=1 in the following use cases. + a. 
run greedy inference using a base gpt nemo file, and an adapter nemo file: + python megatron_gpt_ia3_eval.py \ + gpt_model_file=PATH TO GPT MODEL NEMO FILE \ + adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_ia3_tuning.py) \ + data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ + pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS +""" + +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") + + +@hydra_runner(config_path="conf", config_name="megatron_t5_adapter_inference") +def main(cfg) -> None: + + # trainer required for restoring model parallel models + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + + if ( + cfg.tensor_model_parallel_size < 0 + or cfg.pipeline_model_parallel_size < 0 + or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 + ): + model_config = MegatronT5LoraModel.restore_from( + restore_path=cfg.language_model_path, trainer=trainer, return_config=True, + ) + + with open_dict(cfg): + cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) + cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) + cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) + + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, 
+ pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + + # Load an adapter model, must be provided in config + if cfg.get("adapter_model_file", None) is not None and cfg.get("language_model_path", None) is not None: + # Update frozen GPT model path in case it has changed + adapter_tuning_cfg = MegatronT5LoraModel.restore_from( + cfg.adapter_model_file, trainer=trainer, return_config=True + ) + with open_dict(adapter_tuning_cfg): + adapter_tuning_cfg.language_model_path = cfg.language_model_path + adapter_tuning_cfg.pretrained_language_model_path = cfg.language_model_path + adapter_tuning_cfg.micro_batch_size = cfg.data.micro_batch_size + adapter_tuning_cfg.global_batch_size = cfg.data.global_batch_size + + # Now load prompt learning model with frozen gpt model base + model = MegatronT5LoraModel.restore_from( + restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=adapter_tuning_cfg + ) + + # Or load regular GPT model + else: + raise NotImplementedError( + "This script is meant for inference from an Infused Adapter Tuned T5 Model, config should contain an adapter_model_file and a language_model_path" + ) + + # check whether the DDP is initialized + if parallel_state.is_unitialized(): + + def dummy(): + return + + if trainer.strategy.launcher is not None: + trainer.strategy.launcher.launch(dummy, trainer=trainer) + trainer.strategy.setup_environment() + + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + try: + model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + test_ds, test_dl = model.build_virtual_prompt_dataset( + dataset_paths=cfg.data.test_ds, + batch_size=cfg.data.global_batch_size, + for_train=False, + drop_last=False, + shuffle=False, + num_workers=cfg.data.num_workers, + pin_memory=True, + ) + 
+ config = OmegaConf.to_container(cfg.inference) + model.set_inference_config(config) + response = trainer.predict(model, test_dl) + print("***************************") + if cfg.pred_file_path is not None: + with open(cfg.pred_file_path, "w", encoding="utf-8") as f: + for batch in response: + for inp, pred in zip(batch['input_text'], batch['preds_text']): + inp = ' '.join(inp.split('\n')) + pred = ' '.join(pred.split('\n')) + f.write(f'{inp} {pred}\n') + print("predictions saved to {}".format(cfg.pred_file_path)) + else: + print(response) + print("***************************") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py b/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py new file mode 100644 index 000000000000..b2a45d0ec3fd --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment + +from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5LoraModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + +""" +This is the script to train an Adapter infused GPT Model for text generation. +A base GPT Model is required as a starting point. This script will then insert +Adapters into each Transformer layer and will train/update only these adapters +during training. The base GPT Model weights will remain frozen. + +During training this script will only save the newly trained Adapter weights +in checkpoints. At the end of training a .nemo file of Adapter weights will +be saved. + +Usage: + Assuming the base model is a 125m GPT Model, with TP=1, PP=1: + a. 
run a training run for a base gpt nemo file: + python megatron_gpt_adapter_tuning.py \ + "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", + "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", + model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" + name="NAME OF TRAINING RUN" + exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", + trainer.max_epochs=2 +""" + + +@hydra_runner(config_path="conf", config_name="megatron_t5_lora_tuning_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) + with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' + + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + if cfg.trainer.precision in [16, 'bf16']: + scaler = None + if cfg.trainer.precision == 16: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + ) + if megatron_amp_o2 and not with_distributed_adam: + plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + exp_manager(trainer, cfg.exp_manager) + + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + + # load existing or init new soft prompt GPT model + if 
cfg.model.get("restore_path", None): + model = MegatronT5LoraModel.restore_from( + cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() + ) + else: + model = MegatronT5LoraModel(cfg.model, trainer=trainer) + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index 31c147022486..03bc11cc3d3c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -35,6 +35,9 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, InfusedAdapterConfig, + LoraKQVAdapterConfig, + LoraKVAdapterConfig, + LoraQAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, ) @@ -420,6 +423,132 @@ def list_available_models(cls): pass +class MegatronT5LoraModel(MegatronT5BaseAdapterModel): + """ + TODO (@adithyare) + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer) + # assert cfg.lora_tuning.get('adapter_dim', 0) > 0, "adapter_dim has not been set." + # assert ( + # cfg.lora_tuning.adapter_dim % cfg.tensor_model_parallel_size == 0 + # ), "The adapter dim should be divisible by tensor_model_parallel_size." 
+ + encoder_adapter_name_keys = [AdapterName.LORA_KQV_ADAPTER] + decoder_adapter_name_keys = [ + AdapterName.LORA_KQV_ADAPTER, + AdapterName.LORA_KV_ADAPTER, + AdapterName.LORA_Q_ADAPTER, + ] + + # add adapter keys to the list -> to update state dict + self.adapter_name_keys = encoder_adapter_name_keys + decoder_adapter_name_keys + + frozen_model_cfg = MegatronT5Model.restore_from( + cfg.get('language_model_path'), trainer=trainer, return_config=True + ) + for _, layer in self.frozen_model.named_modules(): + if hasattr(layer, 'activations_checkpoint_method'): + layer.activations_checkpoint_method = ( + None # (@adithyare) adapter learning does not support activations checkpointing atm. + ) + + self.frozen_model.freeze() + logging.info(f'Before adding adapters:\n{self.frozen_model.summarize()}') + encoder = self.frozen_model.enc_dec_model.enc_dec_model.encoder + decoder = self.frozen_model.enc_dec_model.enc_dec_model.decoder + + if encoder: + encoder_cfg = self._get_component_cfg('encoder', frozen_model_cfg, cfg) + self._add_adapters_to_component(encoder, encoder_cfg, encoder_adapter_name_keys) + logging.info(f'Adding encoder adapters:\n{self.frozen_model.summarize()}') + + if decoder: + decoder_cfg = self._get_component_cfg('decoder', frozen_model_cfg, cfg) + self._add_adapters_to_component(decoder, decoder_cfg, decoder_adapter_name_keys) + logging.info(f'Adding decoder adapters:\n{self.frozen_model.summarize()}') + + def _add_adapters_to_component(self, component, component_cfg, adapter_name_keys): + for _, module in component.named_modules(): + if isinstance(module, adapter_mixins.AdapterModuleMixin): + for adapter_key in adapter_name_keys: + adapter_cfg = self._get_adapter_cfg(component_cfg, adapter_key) + if model_utils.import_class_by_path(adapter_cfg._target_) in module.get_accepted_adapter_types(): + module.add_adapter(name=adapter_key, cfg=adapter_cfg) + print(f"in adding {adapter_key}") + + def _get_component_cfg(self, component_name, frozen_model_cfg, 
cfg): + if component_name in frozen_model_cfg: + component_cfg = frozen_model_cfg.get(component_name) + with open_dict(component_cfg): + component_cfg.tensor_model_parallel_size = frozen_model_cfg.tensor_model_parallel_size + component_cfg.lora_tuning = cfg.lora_tuning + else: + component_cfg = frozen_model_cfg + with open_dict(component_cfg): + component_cfg.lora_tuning = cfg.lora_tuning + return component_cfg + + def _get_adapter_cfg(self, component_cfg, adapter_key): + if component_cfg.kv_channels is None: + assert ( + component_cfg.hidden_size % component_cfg.num_attention_heads == 0 + ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None' + kv_channels = component_cfg.hidden_size // component_cfg.num_attention_heads + else: + kv_channels = component_cfg.kv_channels + projection_size = kv_channels * component_cfg.num_attention_heads + + if adapter_key == AdapterName.LORA_KQV_ADAPTER: + adapter_cfg = LoraKQVAdapterConfig( + in_features=component_cfg.hidden_size, + out_features=3 * projection_size, + dim=component_cfg.lora_tuning.kqv_adapter_dim, + norm_position="none", + norm_type="none", + activation="identity", + column_init_method=component_cfg.lora_tuning.get("column_init_method", "normal"), + row_init_method=component_cfg.lora_tuning.get("row_init_method", "zero"), + gather_output=False, + dropout=0.0, + ) + elif adapter_key == AdapterName.LORA_KV_ADAPTER: + adapter_cfg = LoraKVAdapterConfig( + in_features=component_cfg.hidden_size, + out_features=2 * projection_size, + dim=component_cfg.lora_tuning.kv_adapter_dim, + norm_position="none", + norm_type="none", + activation="identity", + column_init_method=component_cfg.lora_tuning.get("column_init_method", "normal"), + row_init_method=component_cfg.lora_tuning.get("row_init_method", "zero"), + gather_output=False, + dropout=0.0, + ) + elif adapter_key == AdapterName.LORA_Q_ADAPTER: + adapter_cfg = LoraQAdapterConfig( + in_features=component_cfg.hidden_size, + out_features=1 * 
projection_size, + dim=component_cfg.lora_tuning.q_adapter_dim, + norm_position="none", + norm_type="none", + activation="identity", + column_init_method=component_cfg.lora_tuning.get("column_init_method", "normal"), + row_init_method=component_cfg.lora_tuning.get("row_init_method", "zero"), + gather_output=False, + dropout=0.0, + ) + else: + raise RuntimeError("Unexpected adapter key name..") + + return adapter_cfg + + @classmethod + def list_available_models(cls): + pass + + class MegatronT5InfusedAdapterModel(MegatronT5BaseAdapterModel): """ MegatronGPTInfusedAdapterModel is a model that combines a base model (GPTModel) with a "Infused Adapter that can Inhibiting and Amplify Inner Activations", known as IA3. diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index b26b971a38ba..679020019ab1 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -202,7 +202,25 @@ class ParallelLinearAdapterConfig: class LoraKQVAdapter(ParallelLinearAdapter): """ - Lora Adapters are the same arch as regualr adapters but with potentially different input and output feature sizes + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + and they do not use an bottleneck activation function + """ + + pass + + +class LoraKVAdapter(ParallelLinearAdapter): + """ + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes + and they do not use an bottleneck activation function + """ + + pass + + +class LoraQAdapter(ParallelLinearAdapter): + """ + Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes and they do not use an bottleneck activation function """ @@ -214,6 +232,16 @@ class 
LoraKQVAdapterConfig(ParallelLinearAdapterConfig): _target_: str = "{0}.{1}".format(LoraKQVAdapter.__module__, LoraKQVAdapter.__name__) +@dataclass +class LoraQAdapterConfig(ParallelLinearAdapterConfig): + _target_: str = "{0}.{1}".format(LoraQAdapter.__module__, LoraQAdapter.__name__) + + +@dataclass +class LoraKVAdapterConfig(ParallelLinearAdapterConfig): + _target_: str = "{0}.{1}".format(LoraKVAdapter.__module__, LoraKVAdapter.__name__) + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ The Tensor Parallel MLP prompt encoder network that is used to generate the virtual diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index aaeb05d43cde..9c954b5e6313 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -22,6 +22,8 @@ AdapterName, InfusedAdapterConfig, LoraKQVAdapterConfig, + LoraKVAdapterConfig, + LoraQAdapterConfig, ) from nemo.collections.nlp.modules.common.megatron.fused_softmax import MatchedScaleMaskSoftmax from nemo.collections.nlp.modules.common.megatron.module import MegatronModule @@ -115,7 +117,14 @@ def __init__( self.megatron_legacy = megatron_legacy self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2) - self.set_accepted_adapter_types([InfusedAdapterConfig._target_, LoraKQVAdapterConfig._target_]) + self.set_accepted_adapter_types( + [ + InfusedAdapterConfig._target_, + LoraKQVAdapterConfig._target_, + LoraQAdapterConfig._target_, + LoraKVAdapterConfig._target_, + ] + ) if kv_channels is None: assert ( @@ -395,6 +404,11 @@ def forward( else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) + if self.is_adapter_available(): + lora_kv_adapter = self.get_adapter_module(AdapterName.LORA_KV_ADAPTER) + if lora_kv_adapter: + lora_mixed_kv_layer = lora_kv_adapter(encoder_output) + mixed_kv_layer = 
mixed_kv_layer + lora_mixed_kv_layer # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] new_tensor_shape = mixed_kv_layer.size()[:-1] + ( @@ -412,6 +426,11 @@ def forward( # Attention head [sq, b, h] --> [sq, b, hp] query_layer, _ = self.query(hidden_states) + if self.is_adapter_available(): + lora_q_adapter = self.get_adapter_module(AdapterName.LORA_Q_ADAPTER) + if lora_q_adapter: + lora_q_layer = lora_q_adapter(hidden_states) + query_layer = query_layer + lora_q_layer # [sq, b, hp] --> [sq, b, np, hn] new_tensor_shape = query_layer.size()[:-1] + ( self.num_attention_heads_per_partition, From 72faf557a1d207425fd3cb68369603e3c369a9c7 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Thu, 8 Jun 2023 20:43:45 -0700 Subject: [PATCH 035/123] NFA updates (#6695) * update V_NEGATIVE_NUM constant to make better use of torch.float32 range Signed-off-by: Elena Rastorgueva * adjust backpointers dtype if U_max too large Signed-off-by: Elena Rastorgueva * Remove print statements Signed-off-by: Elena Rastorgueva * Remove need for user to specify model_downsample_factor Signed-off-by: Elena Rastorgueva * change model.cfg.sample_rate to model.cfg.preprocessor.sample_rate Signed-off-by: Elena Rastorgueva * add check to make sure that window_stride is in model.cfg.preprocessor Signed-off-by: Elena Rastorgueva * reduce memory consumption of backpointers by making them relative instead of absolute Signed-off-by: Elena Rastorgueva * update librosa.get_duration() 'filename' param to 'path' Signed-off-by: Elena Rastorgueva * Do not throw error if 'text' or 'pred_text' are empty and make sure CTM filepaths in the output manifest are null Signed-off-by: Elena Rastorgueva * preprocess input text by removing any duplicate spaces and converting any newlines to spaces Signed-off-by: Elena Rastorgueva * Use Utterance dataclass instead of dictionaries for keeping track of token/word/segment alignments Signed-off-by: Elena Rastorgueva * 
refactor so can save alignments as ctm and ass format files Signed-off-by: Elena Rastorgueva * fix bugs for saving character based ASS files and for using pred_text to do alignment Signed-off-by: Elena Rastorgueva * Make token level .ass file use tokens with recovered capitalization Signed-off-by: Elena Rastorgueva * Do not try to generate alignment files if text or pred text is empty, or if number of tokens is too large for T Signed-off-by: Elena Rastorgueva * rename output manifest file to say '_with_output_file_paths.json' Signed-off-by: Elena Rastorgueva * add flag to resegment ass subtitle file to fill available text space Signed-off-by: Elena Rastorgueva * Fix bug in resegmentation code Signed-off-by: Elena Rastorgueva * Fix bug which skipped some utterances if batch_size more than 1 Signed-off-by: Elena Rastorgueva * reduce memory requirements by doing torch.gather on a slice of the log probs when they are needed Signed-off-by: Elena Rastorgueva * reduce memory requirements by not saving whole v_matrix Signed-off-by: Elena Rastorgueva * remove any extra spaces in pred_text Signed-off-by: Elena Rastorgueva * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused list pred_text_all_lines Signed-off-by: Elena Rastorgueva * support using hybrid Transducer-CTC models for alignment Signed-off-by: Elena Rastorgueva * fix typo - add brackets to torch.cuda.is_available() Signed-off-by: Elena Rastorgueva * make sure token case restoration will work if superscript or subscript num is in text Signed-off-by: Elena Rastorgueva * remove any BOM from input text Signed-off-by: Elena Rastorgueva * pick out 1st hypotheses if there is a tuple of them Signed-off-by: Elena Rastorgueva * Remove print statement Signed-off-by: Elena Rastorgueva * add detail to error message if fail to recover capitalization of tokens Signed-off-by: Elena Rastorgueva * add flag use_local_attention Signed-off-by: Elena Rastorgueva * rename 
additional_ctm_grouping_separator -> additional_segment_grouping_separator Signed-off-by: Elena Rastorgueva * update description of additional_segment_grouping_separator Signed-off-by: Elena Rastorgueva * add simple docstring to get_utt_obj function Signed-off-by: Elena Rastorgueva * Make docstring for add_t_start_end_to_utt_obj Signed-off-by: Elena Rastorgueva * update docstrings for add_t_start_end_to_utt_obj and get_batch_variables Signed-off-by: Elena Rastorgueva * update README and comments in align.py Signed-off-by: Elena Rastorgueva * change 'ground truth' -> 'reference text' in documentation Signed-off-by: Elena Rastorgueva * add header Signed-off-by: Elena Rastorgueva * add comments to get_utt_obj function Signed-off-by: Elena Rastorgueva * move constants so they are after imports Signed-off-by: Elena Rastorgueva * add file description for make_ass_files Signed-off-by: Elena Rastorgueva * get rid of Utterance object's S attribute, and correct tests so they pass now Signed-off-by: Elena Rastorgueva * remove some unused variables Signed-off-by: Elena Rastorgueva * remove unused variable model from functions saving output files Signed-off-by: Elena Rastorgueva * remove unused var minimum_timestamp_duration from make_ass_files functions and return utt_obj Signed-off-by: Elena Rastorgueva * move minimum_timestamp_duration param to CTMFileConfig Signed-off-by: Elena Rastorgueva * remove unused enumerate and unused import Signed-off-by: Elena Rastorgueva * switch reading duration from librosa to soundfile to avoid filename/path deprecation message Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva Signed-off-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tools/nemo_forced_aligner/README.md | 69 +- tools/nemo_forced_aligner/align.py | 186 +++-- tools/nemo_forced_aligner/requirements.txt | 3 +- 
.../test_add_t_start_end_to_boundary_info.py | 121 --- .../tests/test_add_t_start_end_to_utt_obj.py | 288 +++++++ .../tests/test_get_utt_obj.py | 344 ++++++++ .../test_get_y_and_boundary_info_for_utt.py | 158 ---- .../tests/test_restore_token_case.py | 36 + tools/nemo_forced_aligner/utils/constants.py | 2 +- tools/nemo_forced_aligner/utils/data_prep.py | 752 ++++++++++++++---- .../utils/make_ass_files.py | 428 ++++++++++ .../utils/make_ctm_files.py | 114 +++ .../utils/make_output_files.py | 209 ----- .../utils/make_output_manifest.py | 35 + .../utils/viterbi_decoding.py | 70 +- 15 files changed, 2011 insertions(+), 804 deletions(-) delete mode 100644 tools/nemo_forced_aligner/tests/test_add_t_start_end_to_boundary_info.py create mode 100644 tools/nemo_forced_aligner/tests/test_add_t_start_end_to_utt_obj.py create mode 100644 tools/nemo_forced_aligner/tests/test_get_utt_obj.py delete mode 100644 tools/nemo_forced_aligner/tests/test_get_y_and_boundary_info_for_utt.py create mode 100644 tools/nemo_forced_aligner/tests/test_restore_token_case.py create mode 100644 tools/nemo_forced_aligner/utils/make_ass_files.py create mode 100644 tools/nemo_forced_aligner/utils/make_ctm_files.py delete mode 100644 tools/nemo_forced_aligner/utils/make_output_files.py create mode 100644 tools/nemo_forced_aligner/utils/make_output_manifest.py diff --git a/tools/nemo_forced_aligner/README.md b/tools/nemo_forced_aligner/README.md index 35ee78ffecb0..423c76878db6 100644 --- a/tools/nemo_forced_aligner/README.md +++ b/tools/nemo_forced_aligner/README.md @@ -7,7 +7,6 @@ A tool for doing Forced Alignment using Viterbi decoding of NeMo CTC-based model ``` bash python /tools/nemo_forced_aligner/align.py \ pretrained_name="stt_en_citrinet_1024_gamma_0_25" \ - model_downsample_factor=8 \ manifest_filepath= \ output_dir= ``` @@ -23,34 +22,44 @@ Call the `align.py` script, specifying the parameters as follows: * `model_path`: string specifying the local filepath to a CTC NeMo ASR model which will 
be used to generate the log-probs which we will use to do alignment. If `pretrained_name` is specified, `model_path` must not be specified. >Note: NFA can only use CTC models (not Transducer models) at the moment. If you want to transcribe a long audio file (longer than ~5-10 mins), do not use Conformer CTC model as that will likely give Out Of Memory errors. -* `model_downsample_factor`: the downsample factor of the ASR model. It should be 2 if your model is QuartzNet, 4 if it is Conformer CTC, 8 if it is Citrinet. - * `manifest_filepath`: The path to the manifest of the data you want to align, containing `'audio_filepath'` and `'text'` fields. The audio filepaths need to be absolute paths. -* `output_dir`: The folder where to save CTM files containing the generated alignments and new JSON manifest containing paths to those CTM files. There will be one CTM file per utterance (ie one CTM file per line in the manifest). The files will be called `/{tokens,words,additional_segments}/.ctm` and each line in each file will start with ``. By default, `utt_id` will be the stem of the audio_filepath. This can be changed by overriding `audio_filepath_parts_in_utt_id`. The new JSON manifest will be at `/_with_ctm_paths.json`. +* `output_dir`: The folder where to save the output files (e.g. CTM, ASS) containing the generated alignments and new JSON manifest containing paths to those CTM/ASS files. The CTM file will be called `/ctm/{tokens,words,segments}/.ctm` and each line in each file will start with ``. By default, `utt_id` will be the stem of the audio_filepath. This can be changed by overriding `audio_filepath_parts_in_utt_id`. The new JSON manifest will be at `/_with_output_file_paths.json`. The ASS files will be at `/ass/{tokens,words}/.ass`. You can adjust which files should be saved by adjusting the parameter `save_output_file_formats`.
+ +### Optional parameters: + +* `align_using_pred_text`: if True, will transcribe the audio using the ASR model (specified by `pretrained_name` or `model_path`) and then use that transcription as the reference text for the forced alignment. The `"pred_text"` will be saved in the output JSON manifest at `/{original manifest name}_with_output_file_paths.json`. To avoid over-writing other transcribed texts, if there are already `"pred_text"` entries in the original manifest, the program will exit without attempting to generate alignments. (Default: False). + +* `transcribe_device`: The device that will be used for generating log-probs (i.e. transcribing). If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). If specified `transcribe_device` needs to be a string that can be input to the `torch.device()` method. (Default: `None`). + +* `viterbi_device`: The device that will be used for doing Viterbi decoding. If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). If specified, `viterbi_device` needs to be a string that can be input to the `torch.device()` method. (Default: `None`). -* **[OPTIONAL]** `align_using_pred_text`: if True, will transcribe the audio using the ASR model (specified by `pretrained_name` or `model_path`) and then use that transcription as the 'ground truth' for the forced alignment. The `"pred_text"` will be saved in the output JSON manifest at `/{original manifest name}_with_ctm_paths.json`. To avoid over-writing other transcribed texts, if there are already `"pred_text"` entries in the original manifest, the program will exit without attempting to generate alignments. (Default: False). +* `batch_size`: The batch_size that will be used for generating log-probs and doing Viterbi decoding. (Default: 1). -* **[OPTIONAL]** `transcribe_device`: The device that will be used for generating log-probs (i.e. transcribing).
If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). If specified `transcribe_device` needs to be a string that can be input to the `torch.device()` method. (Default: `None`). +* `use_local_attention`: boolean flag specifying whether to try to use local attention for the ASR Model (will only work if the ASR Model is a Conformer model). If local attention is used, we will set the local attention context size to [64,64]. -* **[OPTIONAL]** `viterbi_device`: The device that will be used for doing Viterbi decoding. If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). If specified `transcribe_device` needs to be a string that can be input to the `torch.device()` method.(Default: `None`). +* `additional_segment_grouping_separator`: an optional string used to separate the text into smaller segments. If this is not specified, then the whole text will be treated as a single segment. (Default: `None`. Cannot be empty string or space (" "), as NFA will automatically produce word-level timestamps for substrings separated by spaces). +> Note: the `additional_segment_grouping_separator` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if `additional_segment_grouping_separator="|"`, the following texts will be treated equivalently: `“abc|def”`, `“abc |def”`, `“abc| def”`, `“abc | def"`. -* **[OPTIONAL]** `batch_size`: The batch_size that will be used for generating log-probs and doing Viterbi decoding. (Default: 1). +* `remove_blank_tokens_from_ctm`: a boolean denoting whether to remove tokens from token-level output CTMs. (Default: False). 
-* **[OPTIONAL]** `additional_ctm_grouping_separator`: the string used to separate CTM segments if you want to obtain CTM files at a level that is not the token level or the word level. NFA will always produce token-level and word-level CTM files in: `/tokens/.ctm` and `/words/.ctm`. If `additional_ctm_grouping_separator` is specified, an additional folder `/{tokens/words/additional_segments}/.ctm` will be created containing CTMs for `addtional_ctm_grouping_separator`-separated segments. (Default: `None`. Cannot be empty string or space (" "), as space-separated word-level CTMs will always be saved in `/words/.ctm`.) -> Note: the `additional_ctm_grouping_separator` will be removed from the ground truth text and all the output CTMs, ie it is treated as a marker which is not part of the ground truth. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if `additional_ctm_grouping_separator="|"`, the following texts will be treated equivalently: `“abc|def”`, `“abc |def”`, `“abc| def”`, `“abc | def"`. +* `audio_filepath_parts_in_utt_id`: This specifies how many of the 'parts' of the audio_filepath we will use (starting from the final part of the audio_filepath) to determine the utt_id that will be used in the CTM files. (Default: 1, i.e. utt_id will be the stem of the basename of audio_filepath). Note also that any spaces that are present in the audio_filepath will be replaced with dashes, so as not to change the number of space-separated elements in the CTM files. -* **[OPTIONAL]** `remove_blank_tokens_from_ctm`: a boolean denoting whether to remove tokens from token-level output CTMs. (Default: False). +* `minimum_timestamp_duration`: a float indicating a minimum duration (in seconds) for timestamps in the CTM. 
If any line in the CTM has a duration lower than the `minimum_timestamp_duration`, it will be enlarged from the middle outwards until it meets the minimum_timestamp_duration, or reaches the beginning or end of the audio file. Note that this may cause timestamps to overlap. (Default: 0, i.e. no modifications to predicted duration). -* **[OPTIONAL]** `audio_filepath_parts_in_utt_id`: This specifies how many of the 'parts' of the audio_filepath we will use (starting from the final part of the audio_filepath) to determine the utt_id that will be used in the CTM files. (Default: 1, i.e. utt_id will be the stem of the basename of audio_filepath). Note also that any spaces that are present in the audio_filepath will be replaced with dashes, so as not to change the number of space-separated elements in the CTM files. +* `use_buffered_chunked_streaming`: a flag to indicate whether to do buffered chunk streaming. Notice only CTC models (e.g., stt_en_citrinet_1024_gamma_0_25)with `per_feature` preprocessor are supported. The below two params are needed if this option set to `True`. -* **[OPTIONAL]** `minimum_timestamp_duration`: a float indicating a minimum duration (in seconds) for timestamps in the CTM. If any line in the CTM has a duration lower than the `minimum_timestamp_duration`, it will be enlarged from the middle outwards until it meets the minimum_timestamp_duration, or reaches the beginning or end of the audio file. Note that this may cause timestamps to overlap. (Default: 0, i.e. no modifications to predicted duration). +* `chunk_len_in_secs`: the chunk size for buffered chunked streaming inference. Default is 1.6 seconds. -* **[OPTIONAL]** `use_buffered_chunked_streaming`: a flag to indicate whether to do buffered chunk streaming. Notice only CTC models (e.g., stt_en_citrinet_1024_gamma_0_25)with `per_feature` preprocessor are supported. The below two params are needed if this option set to `True`. 
+* `total_buffer_in_secs`: the buffer size for buffered chunked streaming inference. Default is 4.0 seconds. -* **[OPTIONAL]** `chunk_len_in_secs`: the chunk size for buffered chunked streaming inference. Default is 1.6 seconds. +* `simulate_cache_aware_streaming`: a flag to indicate whether to use cache aware streaming to do get the logits for alignment. Default: `False`. -* **[OPTIONAL]** `total_buffer_in_secs`: the buffer size for buffered chunked streaming inference. Default is 4.0 seconds. +* `save_output_file_formats`: list of file formats to use for saving the output. Default: `["ctm", "ass"]` (these are all the available ones currently). + +* `ctm_file_config`: `CTMFileConfig` to specify the configuration of the output CTM files. + +* `ass_file_config`: `ASSFileConfig` to specify the configuration of the output ASS files. # Input manifest file format By default, NFA needs to be provided with a 'manifest' file where each line specifies the absolute "audio_filepath" and "text" of each utterance that you wish to produce alignments for, like the format below: @@ -58,25 +67,35 @@ By default, NFA needs to be provided with a 'manifest' file where each line spec {"audio_filepath": "/absolute/path/to/audio.wav", "text": "the transcription of the utterance"} ``` -You can omit the `"text"` field from the manifest if you specify `align_using_pred_text=true`. In that case, any `"text"` fields in the manifest will be ignored: the ASR model at `pretrained_name` or `model_path` will be used to transcribe the audio and obtain `"pred_text"`, which will be used as the 'ground truth' for the forced alignment process. The `"pred_text"` will also be saved in the output manifest JSON file at `/_with_ctm_paths.json`. To remove the possibility of overwriting `"pred_text"`, NFA will raise an error if `align_using_pred_text=true` and there are existing `"pred_text"` fields in the original manifest. 
+You can omit the `"text"` field from the manifest if you specify `align_using_pred_text=true`. In that case, any `"text"` fields in the manifest will be ignored: the ASR model at `pretrained_name` or `model_path` will be used to transcribe the audio and obtain `"pred_text"`, which will be used as the reference text for the forced alignment process. The `"pred_text"` will also be saved in the output manifest JSON file at `/_with_output_file_paths.json`. To remove the possibility of overwriting `"pred_text"`, NFA will raise an error if `align_using_pred_text=true` and there are existing `"pred_text"` fields in the original manifest. -> Note: NFA does not require `"duration"` fields in the manifest, and can align long audio files without running out of memory. Depending on your machine specs, you can align audios up to 5-10 minutes on Conformer CTC models, up to around 1.5 hours for QuartzNet models, and up to several hours for Citrinet models. NFA will also produce better alignments the more accurate the ground-truth `"text"` is. +> Note: NFA does not require `"duration"` fields in the manifest, and can align long audio files without running out of memory. The duration of audio file you can align will depend on the amount of memory on your machine. NFA will also produce better alignments the more accurate the reference text in `"text"` is. # Output CTM file format For each utterance specified in a line of `manifest_filepath`, several CTM files will be generated: -* a CTM file containing token-level alignments at `/tokens/.ctm`, -* a CTM file containing word-level alignments at `/words/.ctm`, -* if `additional_ctm_grouping_separator` is specified, there will also be a CTM file containing those segments at `output_dir/additional_segments`. +* a CTM file containing token-level alignments at `/ctm/tokens/.ctm`, +* a CTM file containing word-level alignments at `/ctm/words/.ctm`, +* a CTM file containing segment-level alignments at `/ctm/segments/.ctm`. 
If `additional_segment_grouping_separator` is specified, the segments will be parts of the text separated by `additional_segment_grouping_separator`. If it is not specified, the entire text will be treated as a single segment. + Each CTM file will contain lines of the format: ` 1 `. Note the second item in the line (the 'channel ID', which is required by the CTM file format) is always 1, as NFA operates on single channel audio. +# Output ASS file format +NFA will produce the following ASS files, which you can use to generate subtitle videos: +* ASS files with token-level highlighting will be at `/ass/tokens/.ass`, +* ASS files with word-level highlighting will be at `/ass/words/.ass`. +All words belonging to the same segment will appear at the same time in the subtitles generated with the ASS files. If you find that your segments are not the right size, you can set `ass_file_config.resegment_text_to_fill_space=true` and specify some number of `ass_file_config.max_lines_per_segment`. + + # Output JSON manifest file format -A new manifest file will be saved at `/_with_ctm_paths.json`. It will contain the same fields as the original manifest, and additionally: -* `"token_level_ctm_filepath"` -* `"word_level_ctm_filepath"` -* `"additonal_segment_level_ctm_filepath"` (if `additional_ctm_grouping_separator` is specified) +A new manifest file will be saved at `/_with_output_file_paths.json`.
It will contain the same fields as the original manifest, and additionally: +* `"token_level_ctm_filepath"` (if `save_output_file_formats` contains `ctm`) +* `"word_level_ctm_filepath"` (if `save_output_file_formats` contains `ctm`) +* `"segment_level_ctm_filepath"` (if `save_output_file_formats` contains `ctm`) +* `"token_level_ass_filepath"` (if `save_output_file_formats` contains `ass`) +* `"word_level_ass_filepath"` (if `save_output_file_formats` contains `ass`) * `"pred_text"` (if `align_using_pred_text=true`) diff --git a/tools/nemo_forced_aligner/align.py b/tools/nemo_forced_aligner/align.py index ed3ca3e45b5b..296c4a009cc4 100644 --- a/tools/nemo_forced_aligner/align.py +++ b/tools/nemo_forced_aligner/align.py @@ -15,22 +15,27 @@ import copy import math import os -from dataclasses import dataclass, is_dataclass -from typing import Optional +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import List, Optional import torch from omegaconf import OmegaConf from utils.data_prep import ( + add_t_start_end_to_utt_obj, get_batch_starts_ends, - get_batch_tensors_and_boundary_info, + get_batch_variables, get_manifest_lines_batch, is_entry_in_all_lines, is_entry_in_any_lines, ) -from utils.make_output_files import make_ctm, make_new_manifest +from utils.make_ass_files import make_ass_files +from utils.make_ctm_files import make_ctm_files +from utils.make_output_manifest import write_manifest_out_line from utils.viterbi_decoding import viterbi_decoding from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR from nemo.collections.asr.parts.utils.transcribe_utils import setup_model from nemo.core.config import hydra_runner @@ -48,16 +53,11 @@ log-probs which we will use to do alignment. 
Note: NFA can only use CTC models (not Transducer models) at the moment. Note: if a model_path is provided, it will override the pretrained_name. - model_downsample_factor: an int indicating the downsample factor of the ASR model, ie the ratio of input - timesteps to output timesteps. - If the ASR model is a QuartzNet model, its downsample factor is 2. - If the ASR model is a Conformer CTC model, its downsample factor is 4. - If the ASR model is a Citirnet model, its downsample factor is 8. manifest_filepath: filepath to the manifest of the data you want to align, containing 'audio_filepath' and 'text' fields. output_dir: the folder where output CTM files and new JSON manifest will be saved. align_using_pred_text: if True, will transcribe the audio using the specified model and then use that transcription - as the 'ground truth' for the forced alignment. + as the reference text for the forced alignment. transcribe_device: None, or a string specifying the device that will be used for generating log-probs (i.e. "transcribing"). The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). @@ -65,12 +65,11 @@ The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available (otherwise will set it to 'cpu'). batch_size: int specifying batch size that will be used for generating log-probs and doing Viterbi decoding. - additional_ctm_grouping_separator: the string used to separate CTM segments if you want to obtain CTM files at a - level that is not the token level or the word level. NFA will always produce token-level and word-level CTM - files in: `/tokens/.ctm` and `/words/.ctm`. - If `additional_ctm_grouping_separator` is specified, an additional folder - `/{tokens/words/additional_segments}/.ctm` will be created containing CTMs - for `addtional_ctm_grouping_separator`-separated segments. 
+ use_local_attention: boolean flag specifying whether to try to use local attention for the ASR Model (will only + work if the ASR Model is a Conformer model). If local attention is used, we will set the local attention context + size to [64,64]. + additional_segment_grouping_separator: an optional string used to separate the text into smaller segments. + If this is not specified, then the whole text will be treated as a single segment. remove_blank_tokens_from_ctm: a boolean denoting whether to remove tokens from token-level output CTMs. audio_filepath_parts_in_utt_id: int specifying how many of the 'parts' of the audio_filepath we will use (starting from the final part of the audio_filepath) to determine the @@ -80,11 +79,6 @@ e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 1 => utt_id will be "e1" e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 2 => utt_id will be "d_e1" e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 3 => utt_id will be "c_d_e1" - minimum_timestamp_duration: a float indicating a minimum duration (in seconds) for timestamps in the CTM. If any - line in the CTM has a duration lower than the `minimum_timestamp_duration`, it will be enlarged from the - middle outwards until it meets the minimum_timestamp_duration, or reaches the beginning or end of the audio - file. Note that this may cause timestamps to overlap. - use_buffered_infer: False, if set True, using streaming to do get the logits for alignment This flag is useful when aligning large audio file. 
However, currently the chunk streaming inference does not support batch inference, @@ -96,15 +90,39 @@ which will cut one audio into segments and do inference on chunk_batch_size segments at a time simulate_cache_aware_streaming: False, if set True, using cache aware streaming to do get the logits for alignment + + save_output_file_formats: List of strings specifying what type of output files to save (default: ["ctm", "ass"]) + ctm_file_config: CTMFileConfig to specify the configuration of the output CTM files + ass_file_config: ASSFileConfig to specify the configuration of the output ASS files """ +@dataclass +class CTMFileConfig: + remove_blank_tokens: bool = False + # minimum duration (in seconds) for timestamps in the CTM. If any line in the CTM has a + # duration lower than this, it will be enlarged from the middle outwards until it + # meets the minimum_timestamp_duration, or reaches the beginning or end of the audio file. + # Note that this may cause timestamps to overlap. + minimum_timestamp_duration: float = 0 + + +@dataclass +class ASSFileConfig: + fontsize: int = 20 + marginv: int = 20 + # if resegment_text_to_fill_space is True, the ASS files will use new segments + # such that each segment will not take up more than (approximately) max_lines_per_segment + # when the ASS file is applied to a video + resegment_text_to_fill_space: bool = False + max_lines_per_segment: int = 2 + + @dataclass class AlignmentConfig: # Required configs pretrained_name: Optional[str] = None model_path: Optional[str] = None - model_downsample_factor: Optional[int] = None manifest_filepath: Optional[str] = None output_dir: Optional[str] = None @@ -113,9 +131,8 @@ class AlignmentConfig: transcribe_device: Optional[str] = None viterbi_device: Optional[str] = None batch_size: int = 1 - additional_ctm_grouping_separator: Optional[str] = None - remove_blank_tokens_from_ctm: bool = False - minimum_timestamp_duration: float = 0 + use_local_attention: bool = True + 
additional_segment_grouping_separator: Optional[str] = None audio_filepath_parts_in_utt_id: int = 1 # Buffered chunked streaming configs @@ -127,6 +144,11 @@ class AlignmentConfig: # Cache aware streaming configs simulate_cache_aware_streaming: Optional[bool] = False + # Output file configs + save_output_file_formats: List[str] = field(default_factory=lambda: ["ctm", "ass"]) + ctm_file_config: CTMFileConfig = CTMFileConfig() + ass_file_config: ASSFileConfig = ASSFileConfig() + @hydra_runner(config_name="AlignmentConfig", schema=AlignmentConfig) def main(cfg: AlignmentConfig): @@ -143,9 +165,6 @@ def main(cfg: AlignmentConfig): if cfg.model_path is not None and cfg.pretrained_name is not None: raise ValueError("One of cfg.model_path and cfg.pretrained_name must be None") - if cfg.model_downsample_factor is None: - raise ValueError("cfg.model_downsample_factor must be specified") - if cfg.manifest_filepath is None: raise ValueError("cfg.manifest_filepath must be specified") @@ -155,10 +174,10 @@ def main(cfg: AlignmentConfig): if cfg.batch_size < 1: raise ValueError("cfg.batch_size cannot be zero or a negative number") - if cfg.additional_ctm_grouping_separator == "" or cfg.additional_ctm_grouping_separator == " ": + if cfg.additional_segment_grouping_separator == "" or cfg.additional_segment_grouping_separator == " ": raise ValueError("cfg.additional_grouping_separator cannot be empty string or space character") - if cfg.minimum_timestamp_duration < 0: + if cfg.ctm_file_config.minimum_timestamp_duration < 0: raise ValueError("cfg.minimum_timestamp_duration cannot be a negative number") # Validate manifest contents @@ -179,18 +198,18 @@ def main(cfg: AlignmentConfig): if not is_entry_in_all_lines(cfg.manifest_filepath, "text"): raise RuntimeError( "At least one line in cfg.manifest_filepath does not contain a 'text' entry. " - "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=True." 
+ "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=False." ) # init devices if cfg.transcribe_device is None: - transcribe_device = torch.device("cuda" if torch.cuda.is_available else "cpu") + transcribe_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") else: transcribe_device = torch.device(cfg.transcribe_device) logging.info(f"Device to be used for transcription step (`transcribe_device`) is {transcribe_device}") if cfg.viterbi_device is None: - viterbi_device = torch.device("cuda" if torch.cuda.is_available else "cpu") + viterbi_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") else: viterbi_device = torch.device(cfg.viterbi_device) logging.info(f"Device to be used for viterbi step (`viterbi_device`) is {viterbi_device}") @@ -205,15 +224,24 @@ def main(cfg: AlignmentConfig): model, _ = setup_model(cfg, transcribe_device) model.eval() - if not isinstance(model, EncDecCTCModel): + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoder_type="ctc") + + if cfg.use_local_attention: + logging.info( + "Flag use_local_attention is set to True => will try to use local attention for model if it allows it" + ) + model.change_attention_model(self_attention_model="rel_pos_local_attn", att_context_size=[64, 64]) + + if not (isinstance(model, EncDecCTCModel) or isinstance(model, EncDecHybridRNNTCTCModel)): raise NotImplementedError( - f"Model {cfg.model_name} is not an instance of NeMo EncDecCTCModel." - " Currently only instances of EncDecCTCModels are supported" + f"Model is not an instance of NeMo EncDecCTCModel or ENCDecHybridRNNTCTCModel." + " Currently only instances of these models are supported" ) - if cfg.minimum_timestamp_duration > 0: + if cfg.ctm_file_config.minimum_timestamp_duration > 0: logging.warning( - f"cfg.minimum_timestamp_duration has been set to {cfg.minimum_timestamp_duration} seconds. 
" + f"cfg.ctm_file_config.minimum_timestamp_duration has been set to {cfg.ctm_file_config.minimum_timestamp_duration} seconds. " "This may cause the alignments for some tokens/words/additional segments to be overlapping." ) @@ -255,84 +283,48 @@ def main(cfg: AlignmentConfig): # get start and end line IDs of batches starts, ends = get_batch_starts_ends(cfg.manifest_filepath, cfg.batch_size) - if cfg.align_using_pred_text: - # record pred_texts to save them in the new manifest at the end of this script - pred_text_all_lines = [] - else: - pred_text_all_lines = None + # init output_timestep_duration = None and we will calculate and update it during the first batch + output_timestep_duration = None + + # init f_manifest_out + os.makedirs(cfg.output_dir, exist_ok=True) + tgt_manifest_name = str(Path(cfg.manifest_filepath).stem) + "_with_output_file_paths.json" + tgt_manifest_filepath = str(Path(cfg.output_dir) / tgt_manifest_name) + f_manifest_out = open(tgt_manifest_filepath, 'w') # get alignment and save in CTM batch-by-batch for start, end in zip(starts, ends): manifest_lines_batch = get_manifest_lines_batch(cfg.manifest_filepath, start, end) - ( - log_probs_batch, - y_batch, - T_batch, - U_batch, - token_info_batch, - word_info_batch, - segment_info_batch, - pred_text_batch, - ) = get_batch_tensors_and_boundary_info( + (log_probs_batch, y_batch, T_batch, U_batch, utt_obj_batch, output_timestep_duration,) = get_batch_variables( manifest_lines_batch, model, - cfg.additional_ctm_grouping_separator, + cfg.additional_segment_grouping_separator, cfg.align_using_pred_text, + cfg.audio_filepath_parts_in_utt_id, + output_timestep_duration, cfg.simulate_cache_aware_streaming, cfg.use_buffered_chunked_streaming, buffered_chunk_params, ) - if cfg.align_using_pred_text: - pred_text_all_lines.extend(pred_text_batch) - alignments_batch = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) - make_ctm( - token_info_batch, - alignments_batch, - 
manifest_lines_batch, - model, - cfg.model_downsample_factor, - os.path.join(cfg.output_dir, "tokens"), - cfg.remove_blank_tokens_from_ctm, - cfg.audio_filepath_parts_in_utt_id, - cfg.minimum_timestamp_duration, - ) + for utt_obj, alignment_utt in zip(utt_obj_batch, alignments_batch): - make_ctm( - word_info_batch, - alignments_batch, - manifest_lines_batch, - model, - cfg.model_downsample_factor, - os.path.join(cfg.output_dir, "words"), - False, # dont try to remove blank tokens because we dont expect them to be there anyway - cfg.audio_filepath_parts_in_utt_id, - cfg.minimum_timestamp_duration, - ) + utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration) + + if "ctm" in cfg.save_output_file_formats: + utt_obj = make_ctm_files(utt_obj, cfg.output_dir, cfg.ctm_file_config,) + + if "ass" in cfg.save_output_file_formats: + utt_obj = make_ass_files(utt_obj, cfg.output_dir, cfg.ass_file_config) - if cfg.additional_ctm_grouping_separator: - make_ctm( - segment_info_batch, - alignments_batch, - manifest_lines_batch, - model, - cfg.model_downsample_factor, - os.path.join(cfg.output_dir, "additional_segments"), - False, # dont try to remove blank tokens because we dont expect them to be there anyway - cfg.audio_filepath_parts_in_utt_id, - cfg.minimum_timestamp_duration, + write_manifest_out_line( + f_manifest_out, utt_obj, ) - make_new_manifest( - cfg.output_dir, - cfg.manifest_filepath, - cfg.additional_ctm_grouping_separator, - cfg.audio_filepath_parts_in_utt_id, - pred_text_all_lines, - ) + f_manifest_out.close() return None diff --git a/tools/nemo_forced_aligner/requirements.txt b/tools/nemo_forced_aligner/requirements.txt index 3af8ebf1b488..9daa6d2f2496 100644 --- a/tools/nemo_forced_aligner/requirements.txt +++ b/tools/nemo_forced_aligner/requirements.txt @@ -1,2 +1,3 @@ nemo_toolkit[all] -pytest +prettyprinter # for testing +pytest # for testing diff --git a/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_boundary_info.py 
b/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_boundary_info.py deleted file mode 100644 index 406c4be1fb70..000000000000 --- a/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_boundary_info.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from utils.make_output_files import add_t_start_end_to_boundary_info - -ALIGNMENT = [ - 1, - 1, - 3, - 3, - 4, - 5, - 7, - 7, - 9, - 10, - 11, - 12, - 13, - 15, - 17, - 17, - 19, - 21, - 23, - 23, -] - -INPUT_TOKEN_INFO = [ - {'text': '', 's_start': 0, 's_end': 0}, - {'text': 'h', 's_start': 1, 's_end': 1}, - {'text': '', 's_start': 2, 's_end': 2}, - {'text': 'i', 's_start': 3, 's_end': 3}, - {'text': '', 's_start': 4, 's_end': 4}, - {'text': '', 's_start': 5, 's_end': 5}, - {'text': '', 's_start': 6, 's_end': 6}, - {'text': 'w', 's_start': 7, 's_end': 7}, - {'text': '', 's_start': 8, 's_end': 8}, - {'text': 'o', 's_start': 9, 's_end': 9}, - {'text': '', 's_start': 10, 's_end': 10}, - {'text': 'r', 's_start': 11, 's_end': 11}, - {'text': '', 's_start': 12, 's_end': 12}, - {'text': 'l', 's_start': 13, 's_end': 13}, - {'text': '', 's_start': 14, 's_end': 14}, - {'text': 'd', 's_start': 15, 's_end': 15}, - {'text': '', 's_start': 16, 's_end': 16}, - {'text': '', 's_start': 17, 's_end': 17}, - {'text': '', 's_start': 18, 's_end': 18}, - {'text': 'h', 's_start': 19, 's_end': 19}, - {'text': '', 's_start': 
20, 's_end': 20}, - {'text': 'e', 's_start': 21, 's_end': 21}, - {'text': '', 's_start': 22, 's_end': 22}, - {'text': 'y', 's_start': 23, 's_end': 23}, - {'text': '', 's_start': 24, 's_end': 24}, -] - -EXPECTED_OUTPUT_TOKEN_INFO = [ - {'text': 'h', 's_start': 1, 's_end': 1, 't_start': 0, 't_end': 1}, - {'text': 'i', 's_start': 3, 's_end': 3, 't_start': 2, 't_end': 3}, - {'text': '', 's_start': 4, 's_end': 4, 't_start': 4, 't_end': 4}, - {'text': '', 's_start': 5, 's_end': 5, 't_start': 5, 't_end': 5}, - {'text': 'w', 's_start': 7, 's_end': 7, 't_start': 6, 't_end': 7}, - {'text': 'o', 's_start': 9, 's_end': 9, 't_start': 8, 't_end': 8}, - {'text': '', 's_start': 10, 's_end': 10, 't_start': 9, 't_end': 9}, - {'text': 'r', 's_start': 11, 's_end': 11, 't_start': 10, 't_end': 10}, - {'text': '', 's_start': 12, 's_end': 12, 't_start': 11, 't_end': 11}, - {'text': 'l', 's_start': 13, 's_end': 13, 't_start': 12, 't_end': 12}, - {'text': 'd', 's_start': 15, 's_end': 15, 't_start': 13, 't_end': 13}, - {'text': '', 's_start': 17, 's_end': 17, 't_start': 14, 't_end': 15}, - {'text': 'h', 's_start': 19, 's_end': 19, 't_start': 16, 't_end': 16}, - {'text': 'e', 's_start': 21, 's_end': 21, 't_start': 17, 't_end': 17}, - {'text': 'y', 's_start': 23, 's_end': 23, 't_start': 18, 't_end': 19}, -] - - -INPUT_WORD_INFO = [ - {'text': 'hi', 's_start': 1, 's_end': 3}, - {'text': 'world', 's_start': 7, 's_end': 15}, - {'text': 'hey', 's_start': 19, 's_end': 23}, -] - -EXPECTED_OUTPUT_WORD_INFO = [ - {'text': 'hi', 's_start': 1, 's_end': 3, 't_start': 0, 't_end': 3}, - {'text': 'world', 's_start': 7, 's_end': 15, 't_start': 6, 't_end': 13}, - {'text': 'hey', 's_start': 19, 's_end': 23, 't_start': 16, 't_end': 19}, -] - -INPUT_SEGMENT_INFO = [ - {'text': 'hi world', 's_start': 1, 's_end': 15}, - {'text': 'hey', 's_start': 19, 's_end': 23}, -] - -EXPECTED_OUTPUT_SEGMENT_INFO = [ - {'text': 'hi world', 's_start': 1, 's_end': 15, 't_start': 0, 't_end': 13}, - {'text': 'hey', 's_start': 19, 
's_end': 23, 't_start': 16, 't_end': 19}, -] - - -@pytest.mark.parametrize( - "input_boundary_info_utt,alignment_utt,expected_output_boundary_info_utt", - [ - (INPUT_TOKEN_INFO, ALIGNMENT, EXPECTED_OUTPUT_TOKEN_INFO), - (INPUT_WORD_INFO, ALIGNMENT, EXPECTED_OUTPUT_WORD_INFO), - (INPUT_SEGMENT_INFO, ALIGNMENT, EXPECTED_OUTPUT_SEGMENT_INFO), - ], -) -def test_add_t_start_end_to_boundary_info(input_boundary_info_utt, alignment_utt, expected_output_boundary_info_utt): - output_boundary_info_utt = add_t_start_end_to_boundary_info(input_boundary_info_utt, alignment_utt) - assert output_boundary_info_utt == expected_output_boundary_info_utt diff --git a/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_utt_obj.py b/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_utt_obj.py new file mode 100644 index 000000000000..62092d5afaeb --- /dev/null +++ b/tools/nemo_forced_aligner/tests/test_add_t_start_end_to_utt_obj.py @@ -0,0 +1,288 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy + +import pytest +from utils.data_prep import Segment, Token, Utterance, Word, add_t_start_end_to_utt_obj + +OUTPUT_TIMESTEP_DURATION = 0.04 + +ALIGNMENT = [ + 1, + 1, + 3, + 3, + 4, + 5, + 7, + 7, + 9, + 10, + 11, + 12, + 13, + 15, + 17, + 17, + 19, + 21, + 23, + 23, +] + +EXPECTED_OUTPUT_UTTERANCE = Utterance( + text='hi world | hey', + token_ids_with_blanks=[ + 28, + 8, + 28, + 9, + 28, + 0, + 28, + 23, + 28, + 15, + 28, + 18, + 28, + 12, + 28, + 4, + 28, + 0, + 28, + 8, + 28, + 5, + 28, + 25, + 28, + ], + segments_and_tokens=[ + Token(text='', text_cased='', s_start=0, s_end=0, t_start=-1, t_end=-1), + Segment( + text="hi world", + s_start=1, + s_end=15, + t_start=0 * OUTPUT_TIMESTEP_DURATION, + t_end=14 * OUTPUT_TIMESTEP_DURATION, + words_and_tokens=[ + Word( + text="hi", + s_start=1, + s_end=3, + t_start=0 * OUTPUT_TIMESTEP_DURATION, + t_end=4 * OUTPUT_TIMESTEP_DURATION, + tokens=[ + Token( + text='h', + text_cased='h', + s_start=1, + s_end=1, + t_start=0 * OUTPUT_TIMESTEP_DURATION, + t_end=2 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=2, s_end=2, t_start=-1, t_end=-1), + Token( + text='i', + text_cased='i', + s_start=3, + s_end=3, + t_start=2 * OUTPUT_TIMESTEP_DURATION, + t_end=4 * OUTPUT_TIMESTEP_DURATION, + ), + ], + ), + Token( + text='', + text_cased='', + s_start=4, + s_end=4, + t_start=4 * OUTPUT_TIMESTEP_DURATION, + t_end=5 * OUTPUT_TIMESTEP_DURATION, + ), + Token( + text='', + text_cased='', + s_start=5, + s_end=5, + t_start=5 * OUTPUT_TIMESTEP_DURATION, + t_end=6 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=6, s_end=6, t_start=-1, t_end=-1), + Word( + text="world", + s_start=7, + s_end=15, + t_start=6 * OUTPUT_TIMESTEP_DURATION, + t_end=14 * OUTPUT_TIMESTEP_DURATION, + tokens=[ + Token( + text='w', + text_cased='w', + s_start=7, + s_end=7, + t_start=6 * OUTPUT_TIMESTEP_DURATION, + t_end=8 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=8, s_end=8, 
t_start=-1, t_end=-1), + Token( + text='o', + text_cased='o', + s_start=9, + s_end=9, + t_start=8 * OUTPUT_TIMESTEP_DURATION, + t_end=9 * OUTPUT_TIMESTEP_DURATION, + ), + Token( + text='', + text_cased='', + s_start=10, + s_end=10, + t_start=9 * OUTPUT_TIMESTEP_DURATION, + t_end=10 * OUTPUT_TIMESTEP_DURATION, + ), + Token( + text='r', + text_cased='r', + s_start=11, + s_end=11, + t_start=10 * OUTPUT_TIMESTEP_DURATION, + t_end=11 * OUTPUT_TIMESTEP_DURATION, + ), + Token( + text='', + text_cased='', + s_start=12, + s_end=12, + t_start=11 * OUTPUT_TIMESTEP_DURATION, + t_end=12 * OUTPUT_TIMESTEP_DURATION, + ), + Token( + text='l', + text_cased='l', + s_start=13, + s_end=13, + t_start=12 * OUTPUT_TIMESTEP_DURATION, + t_end=13 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=14, s_end=14, t_start=-1, t_end=-1), + Token( + text='d', + text_cased='d', + s_start=15, + s_end=15, + t_start=13 * OUTPUT_TIMESTEP_DURATION, + t_end=14 * OUTPUT_TIMESTEP_DURATION, + ), + ], + ), + ], + ), + Token(text='', text_cased='', s_start=16, s_end=16, t_start=-1, t_end=-1), + Token( + text='', + text_cased='', + s_start=17, + s_end=17, + t_start=14 * OUTPUT_TIMESTEP_DURATION, + t_end=16 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=18, s_end=18, t_start=-1, t_end=-1), + Segment( + text="hey", + s_start=19, + s_end=23, + t_start=16 * OUTPUT_TIMESTEP_DURATION, + t_end=20 * OUTPUT_TIMESTEP_DURATION, + words_and_tokens=[ + Word( + text="hey", + s_start=19, + s_end=23, + t_start=16 * OUTPUT_TIMESTEP_DURATION, + t_end=20 * OUTPUT_TIMESTEP_DURATION, + tokens=[ + Token( + text='h', + text_cased='h', + s_start=19, + s_end=19, + t_start=16 * OUTPUT_TIMESTEP_DURATION, + t_end=17 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', text_cased='', s_start=20, s_end=20, t_start=-1, t_end=-1), + Token( + text='e', + text_cased='e', + s_start=21, + s_end=21, + t_start=17 * OUTPUT_TIMESTEP_DURATION, + t_end=18 * OUTPUT_TIMESTEP_DURATION, + ), + Token(text='', 
text_cased='', s_start=22, s_end=22, t_start=-1, t_end=-1), + Token( + text='y', + text_cased='y', + s_start=23, + s_end=23, + t_start=18 * OUTPUT_TIMESTEP_DURATION, + t_end=20 * OUTPUT_TIMESTEP_DURATION, + ), + ], + ) + ], + ), + Token(text='', text_cased='', s_start=24, s_end=24, t_start=-1, t_end=-1), + ], +) + + +@pytest.mark.parametrize( + "alignment,expected_output_utterance, output_timestep_duration", + [(ALIGNMENT, EXPECTED_OUTPUT_UTTERANCE, OUTPUT_TIMESTEP_DURATION),], +) +def test_add_t_start_end_to_utt_obj(alignment, expected_output_utterance, output_timestep_duration): + input_utterance = copy.deepcopy(expected_output_utterance) + + # set all t_start and t_end to None in input_utterance + for segment_or_token in input_utterance.segments_and_tokens: + if type(segment_or_token) is Segment: + segment = segment_or_token + segment.t_start = None + segment.t_end = None + + for word_or_token in segment.words_and_tokens: + if type(word_or_token) is Word: + word = word_or_token + word.t_start = None + word.t_end = None + + for token in word.tokens: + token.t_start = None + token.t_end = None + else: + token = word_or_token + token.t_start = None + token.t_end = None + + else: + token = segment_or_token + token.t_start = None + token.t_end = None + + output_utterance = add_t_start_end_to_utt_obj(input_utterance, alignment, output_timestep_duration) + assert output_utterance == expected_output_utterance diff --git a/tools/nemo_forced_aligner/tests/test_get_utt_obj.py b/tools/nemo_forced_aligner/tests/test_get_utt_obj.py new file mode 100644 index 000000000000..31dd978263c0 --- /dev/null +++ b/tools/nemo_forced_aligner/tests/test_get_utt_obj.py @@ -0,0 +1,344 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import prettyprinter +import pytest +from prettyprinter import pretty_call, register_pretty +from utils.data_prep import Segment, Token, Utterance, Word, get_utt_obj + +from nemo.collections.asr.models import ASRModel + + +def get_utt_obj_pp_string(utt_obj): + @register_pretty(Word) + def pretty_utterance(value, ctx): + return pretty_call( + ctx, + Word, + text=value.text, + s_start=value.s_start, + s_end=value.s_end, + t_start=value.t_start, + t_end=value.t_end, + tokens=value.tokens, + ) + + @register_pretty(Segment) + def pretty_utterance(value, ctx): + return pretty_call( + ctx, + Segment, + text=value.text, + s_start=value.s_start, + s_end=value.s_end, + t_start=value.t_start, + t_end=value.t_end, + words_and_tokens=value.words_and_tokens, + ) + + @register_pretty(Utterance) + def pretty_utterance(value, ctx): + return pretty_call( + ctx, + Utterance, + text=value.text, + token_ids_with_blanks=value.token_ids_with_blanks, + segments_and_tokens=value.segments_and_tokens, + audio_filepath=value.audio_filepath, + utt_id=value.utt_id, + ) + + return prettyprinter.pformat(utt_obj) + + +T_FOR_TEST = 999 +AUDIO_FILEPATH_FOR_TEST = "arbitrary_string.wav" +UTT_ID_FOR_TEST = "arbitrary_string" + +EN_TEXT = "hi world | hey" + +EN_CN_EXPECTED_UTTERANCE = Utterance( + text='hi world | hey', + token_ids_with_blanks=[1024, 317, 1024, 472, 1024, 25, 1024, 20, 1024], + segments_and_tokens=[ + Token(text='', text_cased='', s_start=0, s_end=0, t_start=None, t_end=None), + Segment( + text='hi world', + s_start=1, + s_end=3, + t_start=None, + t_end=None, + 
words_and_tokens=[ + Word( + text='hi', + s_start=1, + s_end=1, + t_start=None, + t_end=None, + tokens=[Token(text='▁hi', text_cased='▁hi', s_start=1, s_end=1, t_start=None, t_end=None)], + ), + Token(text='', text_cased='', s_start=2, s_end=2, t_start=None, t_end=None), + Word( + text='world', + s_start=3, + s_end=3, + t_start=None, + t_end=None, + tokens=[Token(text='▁world', text_cased='▁world', s_start=3, s_end=3, t_start=None, t_end=None)], + ), + ], + ), + Token(text='', text_cased='', s_start=4, s_end=4, t_start=None, t_end=None), + Segment( + text='hey', + s_start=5, + s_end=7, + t_start=None, + t_end=None, + words_and_tokens=[ + Word( + text='hey', + s_start=5, + s_end=7, + t_start=None, + t_end=None, + tokens=[ + Token(text='▁he', text_cased='▁he', s_start=5, s_end=5, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=6, s_end=6, t_start=None, t_end=None), + Token(text='y', text_cased='y', s_start=7, s_end=7, t_start=None, t_end=None), + ], + ) + ], + ), + Token(text='', text_cased='', s_start=8, s_end=8, t_start=None, t_end=None), + ], + audio_filepath=AUDIO_FILEPATH_FOR_TEST, + utt_id=UTT_ID_FOR_TEST, +) + +EN_QN_EXPECTED_UTTERANCE = Utterance( + text='hi world | hey', + token_ids_with_blanks=[ + 28, + 8, + 28, + 9, + 28, + 0, + 28, + 23, + 28, + 15, + 28, + 18, + 28, + 12, + 28, + 4, + 28, + 0, + 28, + 8, + 28, + 5, + 28, + 25, + 28, + ], + segments_and_tokens=[ + Token(text='', text_cased='', s_start=0, s_end=0, t_start=None, t_end=None), + Segment( + text="hi world", + s_start=1, + s_end=15, + t_start=None, + t_end=None, + words_and_tokens=[ + Word( + text="hi", + s_start=1, + s_end=3, + t_start=None, + t_end=None, + tokens=[ + Token(text='h', text_cased='h', s_start=1, s_end=1, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=2, s_end=2, t_start=None, t_end=None), + Token(text='i', text_cased='i', s_start=3, s_end=3, t_start=None, t_end=None), + ], + ), + Token(text='', text_cased='', s_start=4, s_end=4, 
t_start=None, t_end=None), + Token(text='', text_cased='', s_start=5, s_end=5, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=6, s_end=6, t_start=None, t_end=None), + Word( + text="world", + s_start=7, + s_end=15, + t_start=None, + t_end=None, + tokens=[ + Token(text='w', text_cased='w', s_start=7, s_end=7, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=8, s_end=8, t_start=None, t_end=None), + Token(text='o', text_cased='o', s_start=9, s_end=9, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=10, s_end=10, t_start=None, t_end=None), + Token(text='r', text_cased='r', s_start=11, s_end=11, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=12, s_end=12, t_start=None, t_end=None), + Token(text='l', text_cased='l', s_start=13, s_end=13, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=14, s_end=14, t_start=None, t_end=None), + Token(text='d', text_cased='d', s_start=15, s_end=15, t_start=None, t_end=None), + ], + ), + ], + ), + Token(text='', text_cased='', s_start=16, s_end=16, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=17, s_end=17, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=18, s_end=18, t_start=None, t_end=None), + Segment( + text="hey", + s_start=19, + s_end=23, + t_start=None, + t_end=None, + words_and_tokens=[ + Word( + text="hey", + s_start=19, + s_end=23, + t_start=None, + t_end=None, + tokens=[ + Token(text='h', text_cased='h', s_start=19, s_end=19, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=20, s_end=20, t_start=None, t_end=None), + Token(text='e', text_cased='e', s_start=21, s_end=21, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=22, s_end=22, t_start=None, t_end=None), + Token(text='y', text_cased='y', s_start=23, s_end=23, t_start=None, t_end=None), + ], + ) + ], + ), + Token(text='', text_cased='', s_start=24, s_end=24, t_start=None, t_end=None), + ], + 
audio_filepath=AUDIO_FILEPATH_FOR_TEST, + utt_id=UTT_ID_FOR_TEST, +) + + +ZH_TEXT = "人工 智能|技术" + +ZH_CN_EXPECTED_UTTERANCE = Utterance( + text='人工 智能|技术', + token_ids_with_blanks=[ + 5206, + 125, + 5206, + 1329, + 5206, + 0, + 5206, + 2029, + 5206, + 3668, + 5206, + 0, + 5206, + 1695, + 5206, + 2075, + 5206, + ], + segments_and_tokens=[ + Token(text='', text_cased='', s_start=0, s_end=0, t_start=None, t_end=None), + Segment( + text='人工 智能', + s_start=1, + s_end=9, + t_start=None, + t_end=None, + words_and_tokens=[ + Word( + text='人工', + s_start=1, + s_end=3, + t_start=None, + t_end=None, + tokens=[ + Token(text='人', text_cased='人', s_start=1, s_end=1, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=2, s_end=2, t_start=None, t_end=None), + Token(text='工', text_cased='工', s_start=3, s_end=3, t_start=None, t_end=None), + ], + ), + Token(text='', text_cased='', s_start=4, s_end=4, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=5, s_end=5, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=6, s_end=6, t_start=None, t_end=None), + Word( + text='智能', + s_start=7, + s_end=9, + t_start=None, + t_end=None, + tokens=[ + Token(text='智', text_cased='智', s_start=7, s_end=7, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=8, s_end=8, t_start=None, t_end=None), + Token(text='能', text_cased='能', s_start=9, s_end=9, t_start=None, t_end=None), + ], + ), + ], + ), + Token(text='', text_cased='', s_start=10, s_end=10, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=11, s_end=11, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=12, s_end=12, t_start=None, t_end=None), + Segment( + text='技术', + s_start=13, + s_end=15, + t_start=None, + t_end=None, + words_and_tokens=[ + Word( + text='技术', + s_start=13, + s_end=15, + t_start=None, + t_end=None, + tokens=[ + Token(text='技', text_cased='技', s_start=13, s_end=13, t_start=None, t_end=None), + Token(text='', text_cased='', s_start=14, 
s_end=14, t_start=None, t_end=None), + Token(text='术', text_cased='术', s_start=15, s_end=15, t_start=None, t_end=None), + ], + ) + ], + ), + Token(text='', text_cased='', s_start=16, s_end=16, t_start=None, t_end=None), + ], + audio_filepath=AUDIO_FILEPATH_FOR_TEST, + utt_id=UTT_ID_FOR_TEST, +) + + +@pytest.mark.parametrize( + "text,model_pretrained_name,separator,expected_utterance", + [ + (EN_TEXT, "stt_en_citrinet_256_gamma_0_25", "|", EN_CN_EXPECTED_UTTERANCE), + (EN_TEXT, "stt_en_quartznet15x5", "|", EN_QN_EXPECTED_UTTERANCE), + (ZH_TEXT, "stt_zh_citrinet_512", "|", ZH_CN_EXPECTED_UTTERANCE), + ], +) +def test_token_info(text, model_pretrained_name, separator, expected_utterance): + model = ASRModel.from_pretrained(model_pretrained_name) + utt_obj = get_utt_obj( + text, model, separator, T=T_FOR_TEST, audio_filepath=AUDIO_FILEPATH_FOR_TEST, utt_id=UTT_ID_FOR_TEST + ) + print(f"expected utterance object: {get_utt_obj_pp_string(expected_utterance)}\n") + print(f"output utterance object in test: {get_utt_obj_pp_string(utt_obj)}\n") + + assert utt_obj == expected_utterance diff --git a/tools/nemo_forced_aligner/tests/test_get_y_and_boundary_info_for_utt.py b/tools/nemo_forced_aligner/tests/test_get_y_and_boundary_info_for_utt.py deleted file mode 100644 index f5bc722d5a1c..000000000000 --- a/tools/nemo_forced_aligner/tests/test_get_y_and_boundary_info_for_utt.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from utils.data_prep import get_y_and_boundary_info_for_utt - -from nemo.collections.asr.models import ASRModel - -EN_TEXT = "hi world | hey" - -EN_QN_EXPECTED_TOKEN_INFO = [ - {'text': '', 's_start': 0, 's_end': 0}, - {'text': 'h', 's_start': 1, 's_end': 1}, - {'text': '', 's_start': 2, 's_end': 2}, - {'text': 'i', 's_start': 3, 's_end': 3}, - {'text': '', 's_start': 4, 's_end': 4}, - {'text': '', 's_start': 5, 's_end': 5}, - {'text': '', 's_start': 6, 's_end': 6}, - {'text': 'w', 's_start': 7, 's_end': 7}, - {'text': '', 's_start': 8, 's_end': 8}, - {'text': 'o', 's_start': 9, 's_end': 9}, - {'text': '', 's_start': 10, 's_end': 10}, - {'text': 'r', 's_start': 11, 's_end': 11}, - {'text': '', 's_start': 12, 's_end': 12}, - {'text': 'l', 's_start': 13, 's_end': 13}, - {'text': '', 's_start': 14, 's_end': 14}, - {'text': 'd', 's_start': 15, 's_end': 15}, - {'text': '', 's_start': 16, 's_end': 16}, - {'text': '', 's_start': 17, 's_end': 17}, - {'text': '', 's_start': 18, 's_end': 18}, - {'text': 'h', 's_start': 19, 's_end': 19}, - {'text': '', 's_start': 20, 's_end': 20}, - {'text': 'e', 's_start': 21, 's_end': 21}, - {'text': '', 's_start': 22, 's_end': 22}, - {'text': 'y', 's_start': 23, 's_end': 23}, - {'text': '', 's_start': 24, 's_end': 24}, -] - -EN_QN_EXPECTED_WORD_INFO = [ - {'text': 'hi', 's_start': 1, 's_end': 3}, - {'text': 'world', 's_start': 7, 's_end': 15}, - {'text': 'hey', 's_start': 19, 's_end': 23}, -] - -EN_QN_EXPECTED_SEGMENT_INFO = [ - {'text': 'hi world', 's_start': 1, 's_end': 15}, - {'text': 'hey', 's_start': 19, 's_end': 23}, -] - -EN_CN_EXPECTED_TOKEN_INFO = [ - {'text': '', 's_start': 0, 's_end': 0}, - {'text': '▁hi', 's_start': 1, 's_end': 1}, - {'text': '', 's_start': 2, 's_end': 2}, - {'text': '▁world', 's_start': 3, 's_end': 3}, - {'text': '', 's_start': 4, 's_end': 4}, - {'text': '▁he', 's_start': 5, 's_end': 5}, - 
{'text': '', 's_start': 6, 's_end': 6}, - {'text': 'y', 's_start': 7, 's_end': 7}, - {'text': '', 's_start': 8, 's_end': 8}, -] - -EN_CN_EXPECTED_WORD_INFO = [ - {'text': 'hi', 's_start': 1, 's_end': 1}, - {'text': 'world', 's_start': 3, 's_end': 3}, - {'text': 'hey', 's_start': 5, 's_end': 7}, -] - -EN_CN_EXPECTED_SEGMENT_INFO = [ - {'text': 'hi world', 's_start': 1, 's_end': 3}, - {'text': 'hey', 's_start': 5, 's_end': 7}, -] - - -ZH_TEXT = "人工 智能|技术" - -ZH_EXPECTED_TOKEN_INFO = [ - {'text': '', 's_start': 0, 's_end': 0}, - {'text': '人', 's_start': 1, 's_end': 1}, - {'text': '', 's_start': 2, 's_end': 2}, - {'text': '工', 's_start': 3, 's_end': 3}, - {'text': '', 's_start': 4, 's_end': 4}, - {'text': '', 's_start': 5, 's_end': 5}, - {'text': '', 's_start': 6, 's_end': 6}, - {'text': '智', 's_start': 7, 's_end': 7}, - {'text': '', 's_start': 8, 's_end': 8}, - {'text': '能', 's_start': 9, 's_end': 9}, - {'text': '', 's_start': 10, 's_end': 10}, - {'text': '', 's_start': 11, 's_end': 11}, - {'text': '', 's_start': 12, 's_end': 12}, - {'text': '技', 's_start': 13, 's_end': 13}, - {'text': '', 's_start': 14, 's_end': 14}, - {'text': '术', 's_start': 15, 's_end': 15}, - {'text': '', 's_start': 16, 's_end': 16}, -] - -ZH_EXPECTED_WORD_INFO = [ - {'text': '人工', 's_start': 1, 's_end': 3}, - {'text': '智能', 's_start': 7, 's_end': 9}, - {'text': '技术', 's_start': 13, 's_end': 15}, -] - -ZH_EXPECTED_SEGMENT_INFO = [ - {'text': '人工 智能', 's_start': 1, 's_end': 9}, - {'text': '技术', 's_start': 13, 's_end': 15}, -] - - -@pytest.mark.parametrize( - "text,model_pretrained_name,separator,expected_token_info", - [ - (EN_TEXT, "stt_en_quartznet15x5", "|", EN_QN_EXPECTED_TOKEN_INFO), - (EN_TEXT, "stt_en_citrinet_256_gamma_0_25", "|", EN_CN_EXPECTED_TOKEN_INFO), - (ZH_TEXT, "stt_zh_citrinet_512", "|", ZH_EXPECTED_TOKEN_INFO), - ], -) -def test_token_info(text, model_pretrained_name, separator, expected_token_info): - model = ASRModel.from_pretrained(model_pretrained_name) - _, token_info, *_ = 
get_y_and_boundary_info_for_utt(text, model, separator) - assert token_info == expected_token_info - - -@pytest.mark.parametrize( - "text,model_pretrained_name,separator,expected_word_info", - [ - (EN_TEXT, "stt_en_quartznet15x5", "|", EN_QN_EXPECTED_WORD_INFO), - (EN_TEXT, "stt_en_citrinet_256_gamma_0_25", "|", EN_CN_EXPECTED_WORD_INFO), - (ZH_TEXT, "stt_zh_citrinet_512", "|", ZH_EXPECTED_WORD_INFO), - ], -) -def test_word_info(text, model_pretrained_name, separator, expected_word_info): - model = ASRModel.from_pretrained(model_pretrained_name) - _, _, word_info, _ = get_y_and_boundary_info_for_utt(text, model, separator) - assert word_info == expected_word_info - - -@pytest.mark.parametrize( - "text,model_pretrained_name,separator,expected_segment_info", - [ - (EN_TEXT, "stt_en_quartznet15x5", "|", EN_QN_EXPECTED_SEGMENT_INFO), - (EN_TEXT, "stt_en_citrinet_256_gamma_0_25", "|", EN_CN_EXPECTED_SEGMENT_INFO), - (ZH_TEXT, "stt_zh_citrinet_512", "|", ZH_EXPECTED_SEGMENT_INFO), - ], -) -def test_segment_info(text, model_pretrained_name, separator, expected_segment_info): - model = ASRModel.from_pretrained(model_pretrained_name) - *_, segment_info = get_y_and_boundary_info_for_utt(text, model, separator) - assert segment_info == expected_segment_info diff --git a/tools/nemo_forced_aligner/tests/test_restore_token_case.py b/tools/nemo_forced_aligner/tests/test_restore_token_case.py new file mode 100644 index 000000000000..6217dfc0ba94 --- /dev/null +++ b/tools/nemo_forced_aligner/tests/test_restore_token_case.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from utils.data_prep import restore_token_case + + +@pytest.mark.parametrize( + "word,word_tokens,expected_word_tokens_cased", + [ + ("HEY!", ['▁he', 'y', '!'], ['▁HE', 'Y', '!']), + ("BabABa▁", ['▁b', 'a', 'b', 'a', 'b', 'a'], ['▁B', 'a', 'b', 'A', 'B', 'a']), + ("BabAB▁a", ['▁b', 'a', 'b', 'a', 'b', '_a'], ['▁B', 'a', 'b', 'A', 'B', '_a']), + ("Bab▁AB▁a", ['▁b', 'a', 'b', '▁a', 'b', '▁a'], ['▁B', 'a', 'b', '▁A', 'B', '▁a']), + ("▁Bab▁AB▁a", ['▁b', 'a', 'b', '▁a', 'b', '▁a'], ['▁B', 'a', 'b', '▁A', 'B', '▁a']), + ("▁Bab▁AB▁▁a", ['▁b', 'a', 'b', '▁a', 'b', '▁a'], ['▁B', 'a', 'b', '▁A', 'B', '▁a']), + ("▁▁BabAB▁a", ['▁b', 'a', 'b', 'a', 'b', '▁a'], ['▁B', 'a', 'b', 'A', 'B', '▁a']), + ("m²", ['▁', 'm', '2'], ['▁', 'm', '2']), + ("²", ['▁', '2'], ['▁', '2']), + ], +) +def test_restore_token_case(word, word_tokens, expected_word_tokens_cased): + word_tokens_cased = restore_token_case(word, word_tokens) + assert word_tokens_cased == expected_word_tokens_cased diff --git a/tools/nemo_forced_aligner/utils/constants.py b/tools/nemo_forced_aligner/utils/constants.py index 894f880401cb..51ce934be479 100644 --- a/tools/nemo_forced_aligner/utils/constants.py +++ b/tools/nemo_forced_aligner/utils/constants.py @@ -16,4 +16,4 @@ SPACE_TOKEN = "" -V_NEGATIVE_NUM = -1e30 +V_NEGATIVE_NUM = -3.4e38 # this is just above the most negative number in torch.float32 diff --git a/tools/nemo_forced_aligner/utils/data_prep.py b/tools/nemo_forced_aligner/utils/data_prep.py index 852be91d78c4..20f401389c4e 100644 --- a/tools/nemo_forced_aligner/utils/data_prep.py 
+++ b/tools/nemo_forced_aligner/utils/data_prep.py @@ -13,13 +13,24 @@ # limitations under the License. import json -import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Union import soundfile as sf import torch from tqdm.auto import tqdm from utils.constants import BLANK_TOKEN, SPACE_TOKEN, V_NEGATIVE_NUM +from nemo.utils import logging + + +def _get_utt_id(audio_filepath, audio_filepath_parts_in_utt_id): + fp_parts = Path(audio_filepath).parts[-audio_filepath_parts_in_utt_id:] + utt_id = Path("_".join(fp_parts)).stem + utt_id = utt_id.replace(" ", "-") # replace any spaces in the filepath with dashes + return utt_id + def get_batch_starts_ends(manifest_filepath, batch_size): """ @@ -70,10 +81,16 @@ def is_entry_in_all_lines(manifest_filepath, entry): def get_manifest_lines_batch(manifest_filepath, start, end): manifest_lines_batch = [] - with open(manifest_filepath, "r") as f: + with open(manifest_filepath, "r", encoding="utf-8-sig") as f: for line_i, line in enumerate(f): if line_i >= start and line_i <= end: - manifest_lines_batch.append(json.loads(line)) + data = json.loads(line) + if "text" in data: + # remove any BOM, any duplicated spaces, convert any + # newline chars to spaces + data["text"] = data["text"].replace("\ufeff", "") + data["text"] = " ".join(data["text"].split()) + manifest_lines_batch.append(data) if line_i == end: break @@ -91,42 +108,138 @@ def get_char_tokens(text, model): return tokens -def get_y_and_boundary_info_for_utt(text, model, separator): +def is_sub_or_superscript_pair(ref_text, text): + """returns True if ref_text is a subscript or superscript version of text""" + sub_or_superscript_to_num = { + "⁰": "0", + "¹": "1", + "²": "2", + "³": "3", + "⁴": "4", + "⁵": "5", + "⁶": "6", + "⁷": "7", + "⁸": "8", + "⁹": "9", + "₀": "0", + "₁": "1", + "₂": "2", + "₃": "3", + "₄": "4", + "₅": "5", + "₆": "6", + "₇": "7", + "₈": "8", + "₉": "9", + } + + if text in sub_or_superscript_to_num: + if 
sub_or_superscript_to_num[text] == ref_text: + return True + return False + + +def restore_token_case(word, word_tokens): + + # remove repeated "▁" and "_" from word as that is what the tokenizer will do + while "▁▁" in word: + word = word.replace("▁▁", "▁") + + while "__" in word: + word = word.repalce("__", "_") + + word_tokens_cased = [] + word_char_pointer = 0 + + for token in word_tokens: + token_cased = "" + + for token_char in token: + if token_char == word[word_char_pointer]: + token_cased += token_char + word_char_pointer += 1 + + else: + if token_char.upper() == word[word_char_pointer] or is_sub_or_superscript_pair( + token_char, word[word_char_pointer] + ): + token_cased += token_char.upper() + word_char_pointer += 1 + else: + if token_char == "▁" or token_char == "_": + if word[word_char_pointer] == "▁" or word[word_char_pointer] == "_": + token_cased += token_char + word_char_pointer += 1 + elif word_char_pointer == 0: + token_cased += token_char + + else: + raise RuntimeError( + f"Unexpected error - failed to recover capitalization of tokens for word {word}" + ) + + word_tokens_cased.append(token_cased) + + return word_tokens_cased + + +@dataclass +class Token: + text: str = None + text_cased: str = None + s_start: int = None + s_end: int = None + t_start: float = None + t_end: float = None + + +@dataclass +class Word: + text: str = None + s_start: int = None + s_end: int = None + t_start: float = None + t_end: float = None + tokens: List[Token] = field(default_factory=list) + + +@dataclass +class Segment: + text: str = None + s_start: int = None + s_end: int = None + t_start: float = None + t_end: float = None + words_and_tokens: List[Union[Word, Token]] = field(default_factory=list) + + +@dataclass +class Utterance: + token_ids_with_blanks: List[int] = field(default_factory=list) + segments_and_tokens: List[Union[Segment, Token]] = field(default_factory=list) + text: str = None + pred_text: str = None + audio_filepath: str = None + utt_id: str = 
None + saved_output_files: dict = field(default_factory=dict) + + +def get_utt_obj( + text, model, separator, T, audio_filepath, utt_id, +): """ - Get y_token_ids_with_blanks, token_info, word_info and segment_info for the text provided, tokenized - by the model provided. - y_token_ids_with_blanks is a list of the indices of the text tokens with the blank token id in between every - text token. - token_info, word_info and segment_info are lists of dictionaries containing information about - where the tokens/words/segments start and end. - For example, 'hi world | hey ' with separator = '|' and tokenized by a BPE tokenizer can have token_info like: - token_info = [ - {'text': '', 's_start': 0, 's_end': 0}, - {'text': '▁hi', 's_start': 1, 's_end': 1}, - {'text': '', 's_start': 2, 's_end': 2}, - {'text': '▁world', 's_start': 3, 's_end': 3}, - {'text': '', 's_start': 4, 's_end': 4}, - {'text': '▁he', 's_start': 5, 's_end': 5}, - {'text': '', 's_start': 6, 's_end': 6}, - {'text': 'y', 's_start': 7, 's_end': 7}, - {'text': '', 's_start': 8, 's_end': 8}, - ] - 's_start' and 's_end' indicate where in the sequence of tokens does each token start and end. - - The word_info will be as follows: - word_info = [ - {'text': 'hi', 's_start': 1, 's_end': 1}, - {'text': 'world', 's_start': 3, 's_end': 3}, - {'text': 'hey', 's_start': 5, 's_end': 7}, - ] - 's_start' and 's_end' indicate where in the sequence of tokens does each word start and end. - - segment_info will be as follows: - segment_info = [ - {'text': 'hi world', 's_start': 1, 's_end': 3}, - {'text': 'hey', 's_start': 5, 's_end': 7}, - ] - 's_start' and 's_end' indicate where in the sequence of tokens does each segment start and end. + Function to create an Utterance object and add all necessary information to it except + for timings of the segments / words / tokens according to the alignment - that will + be done later in a different function, after the alignment is done. 
+ + The Utterance object has a list segments_and_tokens which contains Segment objects and + Token objects (for blank tokens in between segments). + Within the Segment objects, there is a list words_and_tokens which contains Word objects and + Token objects (for blank tokens in between words). + Within the Word objects, there is a list tokens tokens which contains Token objects for + blank and non-blank tokens. + We will be building up these lists in this function. This data structure will then be useful for + generating the various output files that we wish to save. """ if not separator: # if separator is not defined - treat the whole text as one segment @@ -137,157 +250,429 @@ def get_y_and_boundary_info_for_utt(text, model, separator): # remove any spaces at start and end of segments segments = [seg.strip() for seg in segments] + utt = Utterance(text=text, audio_filepath=audio_filepath, utt_id=utt_id,) + + # build up lists: token_ids_with_blanks, segments_and_tokens. + # The code for these is different depending on whether we use char-based tokens or not if hasattr(model, 'tokenizer'): if hasattr(model, 'blank_id'): BLANK_ID = model.blank_id else: - BLANK_ID = len(model.decoder.vocabulary) # TODO: check + BLANK_ID = len(model.tokenizer.vocab) # TODO: check - y_token_ids_with_blanks = [BLANK_ID] - token_info = [{"text": BLANK_TOKEN, "s_start": 0, "s_end": 0,}] - word_info = [] - segment_info = [] + utt.token_ids_with_blanks = [BLANK_ID] + + # check for text being 0 length + if len(text) == 0: + return utt + + # check for # tokens + token repetitions being > T + all_tokens = model.tokenizer.text_to_ids(text) + n_token_repetitions = 0 + for i_tok in range(1, len(all_tokens)): + if all_tokens[i_tok] == all_tokens[i_tok - 1]: + n_token_repetitions += 1 + + if len(all_tokens) + n_token_repetitions > T: + logging.info( + f"Utterance {utt_id} has too many tokens compared to the audio file duration." + " Will not generate output alignment files for this utterance." 
+ ) + return utt + + # build up data structures containing segments/words/tokens + utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,)) segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank for segment in segments: + # add the segment to segment_info and increment the segment_s_pointer + segment_tokens = model.tokenizer.text_to_tokens(segment) + utt.segments_and_tokens.append( + Segment( + text=segment, + s_start=segment_s_pointer, + # segment_tokens do not contain blanks => need to muliply by 2 + # s_end needs to be the index of the final token (including blanks) of the current segment: + # segment_s_pointer + len(segment_tokens) * 2 is the index of the first token of the next segment => + # => need to subtract 2 + s_end=segment_s_pointer + len(segment_tokens) * 2 - 2, + ) + ) + segment_s_pointer += ( + len(segment_tokens) * 2 + ) # multiply by 2 to account for blanks (which are not present in segment_tokens) + words = segment.split(" ") # we define words to be space-separated sub-strings - for word in words: + for word_i, word in enumerate(words): word_tokens = model.tokenizer.text_to_tokens(word) - word_ids = model.tokenizer.text_to_ids(word) - for token, id_ in zip(word_tokens, word_ids): - # add the text token and the blank that follows it - # to our token-based variables - y_token_ids_with_blanks.extend([id_, BLANK_ID]) - token_info.extend( - [ - { - "text": token, - "s_start": len(y_token_ids_with_blanks) - 2, - "s_end": len(y_token_ids_with_blanks) - 2, - }, - { - "text": BLANK_TOKEN, - "s_start": len(y_token_ids_with_blanks) - 1, - "s_end": len(y_token_ids_with_blanks) - 1, - }, - ] - ) + word_token_ids = model.tokenizer.text_to_ids(word) + word_tokens_cased = restore_token_case(word, word_tokens) # add the word to word_info and increment the word_s_pointer - word_info.append( - { - "text": word, - "s_start": 
word_s_pointer, - "s_end": word_s_pointer + (len(word_tokens) - 1) * 2, # TODO check this, - } + utt.segments_and_tokens[-1].words_and_tokens.append( + # word_tokens do not contain blanks => need to muliply by 2 + # s_end needs to be the index of the final token (including blanks) of the current word: + # word_s_pointer + len(word_tokens) * 2 is the index of the first token of the next word => + # => need to subtract 2 + Word(text=word, s_start=word_s_pointer, s_end=word_s_pointer + len(word_tokens) * 2 - 2) ) - word_s_pointer += len(word_tokens) * 2 # TODO check this + word_s_pointer += ( + len(word_tokens) * 2 + ) # multiply by 2 to account for blanks (which are not present in word_tokens) + + for token_i, (token, token_id, token_cased) in enumerate( + zip(word_tokens, word_token_ids, word_tokens_cased) + ): + # add the text tokens and the blanks in between them + # to our token-based variables + utt.token_ids_with_blanks.extend([token_id, BLANK_ID]) + # adding Token object for non-blank token + utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append( + Token( + text=token, + text_cased=token_cased, + # utt.token_ids_with_blanks has the form [...., , ] => + # => if do len(utt.token_ids_with_blanks) - 1 you get the index of the final + # => we want to do len(utt.token_ids_with_blanks) - 2 to get the index of + s_start=len(utt.token_ids_with_blanks) - 2, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 2, + ) + ) - # add the segment to segment_info and increment the segment_s_pointer - segment_tokens = model.tokenizer.text_to_tokens(segment) - segment_info.append( - { - "text": segment, - "s_start": segment_s_pointer, - "s_end": segment_s_pointer + (len(segment_tokens) - 1) * 2, - } + # adding Token object for blank tokens in between the tokens of the word + # (ie do not add another blank if you have reached the end) + if token_i < len(word_tokens) - 1: + 
utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form [...., ] => + # => if do len(utt.token_ids_with_blanks) -1 you get the index of this + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) + ) + + # add a Token object for blanks in between words in this segment + # (but only *in between* - do not add the token if it is after the final word) + if word_i < len(words) - 1: + utt.segments_and_tokens[-1].words_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form [...., ] => + # => if do len(utt.token_ids_with_blanks) -1 you get the index of this + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) + ) + + # add the blank token in between segments/after the final segment + utt.segments_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form [...., ] => + # => if do len(utt.token_ids_with_blanks) -1 you get the index of this + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) ) - segment_s_pointer += len(segment_tokens) * 2 - return y_token_ids_with_blanks, token_info, word_info, segment_info + return utt elif hasattr(model.decoder, "vocabulary"): # i.e. 
tokenization is simply character-based BLANK_ID = len(model.decoder.vocabulary) # TODO: check this is correct SPACE_ID = model.decoder.vocabulary.index(" ") - y_token_ids_with_blanks = [BLANK_ID] - token_info = [{"text": BLANK_TOKEN, "s_start": 0, "s_end": 0,}] - word_info = [] - segment_info = [] + utt.token_ids_with_blanks = [BLANK_ID] + + # check for text being 0 length + if len(text) == 0: + return utt + + # check for # tokens + token repetitions being > T + all_tokens = get_char_tokens(text, model) + n_token_repetitions = 0 + for i_tok in range(1, len(all_tokens)): + if all_tokens[i_tok] == all_tokens[i_tok - 1]: + n_token_repetitions += 1 + + if len(all_tokens) + n_token_repetitions > T: + logging.info( + f"Utterance {utt_id} has too many tokens compared to the audio file duration." + " Will not generate output alignment files for this utterance." + ) + return utt + + # build up data structures containing segments/words/tokens + utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,)) segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank for i_segment, segment in enumerate(segments): - words = segment.split(" ") # we define words to be space-separated characters + # add the segment to segment_info and increment the segment_s_pointer + segment_tokens = get_char_tokens(segment, model) + utt.segments_and_tokens.append( + Segment( + text=segment, + s_start=segment_s_pointer, + # segment_tokens do not contain blanks => need to muliply by 2 + # s_end needs to be the index of the final token (including blanks) of the current segment: + # segment_s_pointer + len(segment_tokens) * 2 is the index of the first token of the next segment => + # => need to subtract 2 + s_end=segment_s_pointer + len(segment_tokens) * 2 - 2, + ) + ) + + # for correct calculation: multiply len(segment_tokens) by 2 to account for blanks (which are not 
present in segment_tokens) + # and + 2 to account for [, ] + segment_s_pointer += len(segment_tokens) * 2 + 2 + + words = segment.split(" ") # we define words to be space-separated substrings for i_word, word in enumerate(words): # convert string to list of characters word_tokens = list(word) # convert list of characters to list of their ids in the vocabulary - word_ids = get_char_tokens(word, model) - for token, id_ in zip(word_tokens, word_ids): - # add the text token and the blank that follows it + word_token_ids = get_char_tokens(word, model) + + # add the word to word_info and increment the word_s_pointer + utt.segments_and_tokens[-1].words_and_tokens.append( + # note for s_end: + # word_tokens do not contain blanks => need to muliply by 2 + # s_end needs to be the index of the final token (including blanks) of the current word: + # word_s_pointer + len(word_tokens) * 2 is the index of the first token of the next word => + # => need to subtract 2 + Word(text=word, s_start=word_s_pointer, s_end=word_s_pointer + len(word_tokens) * 2 - 2) + ) + + # for correct calculation: multiply len(word_tokens) by 2 to account for blanks (which are not present in word_tokens) + # and + 2 to account for [, ] + word_s_pointer += len(word_tokens) * 2 + 2 + + for token_i, (token, token_id) in enumerate(zip(word_tokens, word_token_ids)): + # add the text tokens and the blanks in between them # to our token-based variables - y_token_ids_with_blanks.extend([id_, BLANK_ID]) - token_info.extend( - [ - { - "text": token, - "s_start": len(y_token_ids_with_blanks) - 2, - "s_end": len(y_token_ids_with_blanks) - 2, - }, - { - "text": BLANK_TOKEN, - "s_start": len(y_token_ids_with_blanks) - 1, - "s_end": len(y_token_ids_with_blanks) - 1, - }, - ] + utt.token_ids_with_blanks.extend([token_id]) + utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append( + Token( + text=token, + text_cased=token, + # utt.token_ids_with_blanks has the form [..., ] + # => do len(utt.token_ids_with_blanks) 
- 1 to get the index of this non-blank token + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) ) - # add space token (and the blank after it) unless this is the final word in the final segment - if not (i_segment == len(segments) - 1 and i_word == len(words) - 1): - y_token_ids_with_blanks.extend([SPACE_ID, BLANK_ID]) - token_info.extend( - ( - { - "text": SPACE_TOKEN, - "s_start": len(y_token_ids_with_blanks) - 2, - "s_end": len(y_token_ids_with_blanks) - 2, - }, - { - "text": BLANK_TOKEN, - "s_start": len(y_token_ids_with_blanks) - 1, - "s_end": len(y_token_ids_with_blanks) - 1, - }, + if token_i < len(word_tokens) - 1: # only add blank tokens that are in the middle of words + utt.token_ids_with_blanks.extend([BLANK_ID]) + utt.segments_and_tokens[-1].words_and_tokens[-1].tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form [..., ] + # => do len(utt.token_ids_with_blanks) - 1 to get the index of this blank token + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) + ) + + # add space token (and the blanks around it) unless this is the final word in a segment + if i_word < len(words) - 1: + utt.token_ids_with_blanks.extend([BLANK_ID, SPACE_ID, BLANK_ID]) + utt.segments_and_tokens[-1].words_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form + # [..., , , , ] + # => do len(utt.token_ids_with_blanks) - 3 to get the index of the blank token before the space token + s_start=len(utt.token_ids_with_blanks) - 3, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 3, + ) + ) + utt.segments_and_tokens[-1].words_and_tokens.append( + 
Token( + text=SPACE_TOKEN, + text_cased=SPACE_TOKEN, + # utt.token_ids_with_blanks has the form + # [..., , , , ] + # => do len(utt.token_ids_with_blanks) - 2 to get the index of the space token + s_start=len(utt.token_ids_with_blanks) - 2, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 2, + ) + ) + utt.segments_and_tokens[-1].words_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form + # [..., , , , ] + # => do len(utt.token_ids_with_blanks) - 1 to get the index of the blank token after the space token + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, ) ) - # add the word to word_info and increment the word_s_pointer - word_info.append( - { - "text": word, - "s_start": word_s_pointer, - "s_end": word_s_pointer + len(word_tokens) * 2 - 2, # TODO check this, - } - ) - word_s_pointer += len(word_tokens) * 2 + 2 # TODO check this - # add the segment to segment_info and increment the segment_s_pointer - segment_tokens = get_char_tokens(segment, model) - segment_info.append( - { - "text": segment, - "s_start": segment_s_pointer, - "s_end": segment_s_pointer + (len(segment_tokens) - 1) * 2, - } + # add a blank to the segment, and add a space after if this is not the final segment + utt.token_ids_with_blanks.extend([BLANK_ID]) + utt.segments_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form [..., ] + # => do len(utt.token_ids_with_blanks) - 1 to get the index of this blank token + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) ) - segment_s_pointer += len(segment_tokens) * 2 + 2 - return y_token_ids_with_blanks, 
token_info, word_info, segment_info + if i_segment < len(segments) - 1: + utt.token_ids_with_blanks.extend([SPACE_ID, BLANK_ID]) + utt.segments_and_tokens.append( + Token( + text=SPACE_TOKEN, + text_cased=SPACE_TOKEN, + # utt.token_ids_with_blanks has the form + # [..., , ] + # => do len(utt.token_ids_with_blanks) - 2 to get the index of the space token + s_start=len(utt.token_ids_with_blanks) - 2, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 2, + ) + ) + utt.segments_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + # utt.token_ids_with_blanks has the form + # [..., , ] + # => do len(utt.token_ids_with_blanks) - 1 to get the index of the blank token + s_start=len(utt.token_ids_with_blanks) - 1, + # s_end is same as s_start since the token only occupies one element in the list + s_end=len(utt.token_ids_with_blanks) - 1, + ) + ) + + return utt else: raise RuntimeError("Cannot get tokens of this model.") -def get_batch_tensors_and_boundary_info( +def add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration): + """ + Function to add t_start and t_end (representing time in seconds) to the Utterance object utt_obj. + Args: + utt_obj: Utterance object to which we will add t_start and t_end for its + constituent segments/words/tokens. + alignment_utt: a list of ints indicating which token does the alignment pass through at each + timestep (will take the form [0, 0, 1, 1, ..., ]). + output_timestep_duration: a float indicating the duration of a single output timestep from + the ASR Model. + + Returns: + utt_obj: updated Utterance object. 
+ """ + + # General idea for the algorithm of how we add t_start and t_end + # the timestep where a token s starts is the location of the first appearance of s_start in alignment_utt + # the timestep where a token s ends is the location of the final appearance of s_end in alignment_utt + # We will make dictionaries num_to_first_alignment_appearance and + # num_to_last_appearance and use that to update all of + # the t_start and t_end values in utt_obj. + # We will put t_start = t_end = -1 for tokens that are skipped (should only be blanks) + + num_to_first_alignment_appearance = dict() + num_to_last_alignment_appearance = dict() + + prev_s = -1 # use prev_s to keep track of when the s changes + for t, s in enumerate(alignment_utt): + if s > prev_s: + num_to_first_alignment_appearance[s] = t + + if prev_s >= 0: # dont record prev_s = -1 + num_to_last_alignment_appearance[prev_s] = t - 1 + prev_s = s + # add last appearance of the final s + num_to_last_alignment_appearance[prev_s] = len(alignment_utt) - 1 + + # update all the t_start and t_end in utt_obj + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + segment = segment_or_token + segment.t_start = num_to_first_alignment_appearance[segment.s_start] * output_timestep_duration + segment.t_end = (num_to_last_alignment_appearance[segment.s_end] + 1) * output_timestep_duration + + for word_or_token in segment.words_and_tokens: + if type(word_or_token) is Word: + word = word_or_token + word.t_start = num_to_first_alignment_appearance[word.s_start] * output_timestep_duration + word.t_end = (num_to_last_alignment_appearance[word.s_end] + 1) * output_timestep_duration + + for token in word.tokens: + if token.s_start in num_to_first_alignment_appearance: + token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration + else: + token.t_start = -1 + + if token.s_end in num_to_last_alignment_appearance: + token.t_end = ( + 
num_to_last_alignment_appearance[token.s_end] + 1 + ) * output_timestep_duration + else: + token.t_end = -1 + else: + token = word_or_token + if token.s_start in num_to_first_alignment_appearance: + token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration + else: + token.t_start = -1 + + if token.s_end in num_to_last_alignment_appearance: + token.t_end = (num_to_last_alignment_appearance[token.s_end] + 1) * output_timestep_duration + else: + token.t_end = -1 + + else: + token = segment_or_token + if token.s_start in num_to_first_alignment_appearance: + token.t_start = num_to_first_alignment_appearance[token.s_start] * output_timestep_duration + else: + token.t_start = -1 + + if token.s_end in num_to_last_alignment_appearance: + token.t_end = (num_to_last_alignment_appearance[token.s_end] + 1) * output_timestep_duration + else: + token.t_end = -1 + + return utt_obj + + +def get_batch_variables( manifest_lines_batch, model, separator, align_using_pred_text, + audio_filepath_parts_in_utt_id, + output_timestep_duration, simulate_cache_aware_streaming=False, use_buffered_chunked_streaming=False, buffered_chunk_params={}, @@ -296,10 +681,9 @@ def get_batch_tensors_and_boundary_info( Returns: log_probs, y, T, U (y and U are s.t. every other token is a blank) - these are the tensors we will need during Viterbi decoding. - token_info_list, word_info_list, segment_info_list - these are lists of dictionaries which we will need - for writing the CTM files with the human-readable alignments. - pred_text_list - this is a list of the transcriptions from our model which we will save to our output JSON - file if align_using_pred_text is True. + utt_obj_batch: a list of Utterance objects for every utterance in the batch. + output_timestep_duration: a float indicating the duration of a single output timestep from + the ASR Model. 
""" # get hypotheses by calling 'transcribe' @@ -320,6 +704,11 @@ def get_batch_tensors_and_boundary_info( hypotheses = model.transcribe_simulate_cache_aware_streaming( audio_filepaths_batch, return_hypotheses=True, batch_size=B ) + + # if hypotheses form a tuple (from Hybrid model), extract just "best" hypothesis + if type(hypotheses) == tuple and len(hypotheses) == 2: + hypotheses = hypotheses[0] + for hypothesis in hypotheses: log_probs_list_batch.append(hypothesis.y_sequence) T_list_batch.append(hypothesis.y_sequence.shape[0]) @@ -341,30 +730,52 @@ def get_batch_tensors_and_boundary_info( # token_info_batch, word_info_batch, segment_info_batch y_list_batch = [] U_list_batch = [] - token_info_batch = [] - word_info_batch = [] - segment_info_batch = [] + utt_obj_batch = [] for i_line, line in enumerate(manifest_lines_batch): if align_using_pred_text: - gt_text_for_alignment = pred_text_batch[i_line] + gt_text_for_alignment = " ".join(pred_text_batch[i_line].split()) else: gt_text_for_alignment = line["text"] - y_utt, token_info_utt, word_info_utt, segment_info_utt = get_y_and_boundary_info_for_utt( - gt_text_for_alignment, model, separator + utt_obj = get_utt_obj( + gt_text_for_alignment, + model, + separator, + T_list_batch[i_line], + audio_filepaths_batch[i_line], + _get_utt_id(audio_filepaths_batch[i_line], audio_filepath_parts_in_utt_id), ) - y_list_batch.append(y_utt) - U_list_batch.append(len(y_utt)) - token_info_batch.append(token_info_utt) - word_info_batch.append(word_info_utt) - segment_info_batch.append(segment_info_utt) + # update utt_obj.pred_text or utt_obj.text + if align_using_pred_text: + utt_obj.pred_text = pred_text_batch[i_line] + if len(utt_obj.pred_text) == 0: + logging.info( + f"'pred_text' of utterance {utt_obj.utt_id} is empty - we will not generate" + " any output alignment files for this utterance" + ) + if "text" in line: + utt_obj.text = line["text"] # keep the text as we will save it in the output manifest + else: + utt_obj.text = 
line["text"] + if len(utt_obj.text) == 0: + logging.info( + f"'text' of utterance {utt_obj.utt_id} is empty - we will not generate" + " any output alignment files for this utterance" + ) + + y_list_batch.append(utt_obj.token_ids_with_blanks) + U_list_batch.append(len(utt_obj.token_ids_with_blanks)) + utt_obj_batch.append(utt_obj) # turn log_probs, y, T, U into dense tensors for fast computation during Viterbi decoding T_max = max(T_list_batch) U_max = max(U_list_batch) # V = the number of tokens in the vocabulary + 1 for the blank token. - V = len(model.decoder.vocabulary) + 1 + if hasattr(model, 'tokenizer'): + V = len(model.tokenizer.vocab) + 1 + else: + V = len(model.decoder.vocabulary) + 1 T_batch = torch.tensor(T_list_batch) U_batch = torch.tensor(U_list_batch) @@ -383,13 +794,40 @@ def get_batch_tensors_and_boundary_info( U_utt = U_batch[b] y_batch[b, :U_utt] = torch.tensor(y_utt) + # calculate output_timestep_duration if it is None + if output_timestep_duration is None: + if not 'window_stride' in model.cfg.preprocessor: + raise ValueError( + "Don't have attribute 'window_stride' in 'model.cfg.preprocessor' => cannot calculate " + " model_downsample_factor => stopping process" + ) + + if not 'sample_rate' in model.cfg.preprocessor: + raise ValueError( + "Don't have attribute 'sample_rate' in 'model.cfg.preprocessor' => cannot calculate start " + " and end time of segments => stopping process" + ) + + with sf.SoundFile(audio_filepaths_batch[0]) as f: + audio_dur = f.frames / f.samplerate + n_input_frames = audio_dur / model.cfg.preprocessor.window_stride + model_downsample_factor = round(n_input_frames / int(T_batch[0])) + + output_timestep_duration = ( + model.preprocessor.featurizer.hop_length * model_downsample_factor / model.cfg.preprocessor.sample_rate + ) + + logging.info( + f"Calculated that the model downsample factor is {model_downsample_factor}" + f" and therefore the ASR model output timestep duration is {output_timestep_duration}" + " -- will use 
this for all batches" + ) + return ( log_probs_batch, y_batch, T_batch, U_batch, - token_info_batch, - word_info_batch, - segment_info_batch, - pred_text_batch, + utt_obj_batch, + output_timestep_duration, ) diff --git a/tools/nemo_forced_aligner/utils/make_ass_files.py b/tools/nemo_forced_aligner/utils/make_ass_files.py new file mode 100644 index 000000000000..f1beea838573 --- /dev/null +++ b/tools/nemo_forced_aligner/utils/make_ass_files.py @@ -0,0 +1,428 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file contains functions for make ASS-format subtitle files based on the generated alignment. +ASS files can be generated highlighting token-level alignments or word-level alignments. +In both cases, 'segment' boundaries will be used to determine which parts of the text will appear +at the same time. +For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined +by the NFA alignments. +For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined +by the NFA alignemtns. 
+""" + +import os + +from utils.constants import BLANK_TOKEN, SPACE_TOKEN +from utils.data_prep import Segment, Token, Word + +PLAYERRESX = 384 +PLAYERRESY = 288 +MARGINL = 10 +MARGINR = 10 + + +def seconds_to_ass_format(seconds_float): + seconds_float = float(seconds_float) + mm, ss_decimals = divmod(seconds_float, 60) + hh, mm = divmod(mm, 60) + + hh = str(round(hh)) + if len(hh) == 1: + hh = '0' + hh + + mm = str(round(mm)) + if len(mm) == 1: + mm = '0' + mm + + ss_decimals = f"{ss_decimals:.2f}" + if len(ss_decimals.split(".")[0]) == 1: + ss_decimals = "0" + ss_decimals + + srt_format_time = f"{hh}:{mm}:{ss_decimals}" + + return srt_format_time + + +def make_ass_files( + utt_obj, output_dir_root, ass_file_config, +): + + # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen + # in the case of the ground truth text being empty or the number of tokens being too large vs audio duration + if not utt_obj.segments_and_tokens: + return utt_obj + + if ass_file_config.resegment_text_to_fill_space: + utt_obj = resegment_utt_obj(utt_obj, ass_file_config) + + utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config,) + utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config,) + + return utt_obj + + +def _get_word_n_chars(word): + n_chars = 0 + for token in word.tokens: + if token.text != BLANK_TOKEN: + n_chars += len(token.text) + return n_chars + + +def _get_segment_n_chars(segment): + n_chars = 0 + for word_or_token in segment.words_and_tokens: + if word_or_token.text == SPACE_TOKEN: + n_chars += 1 + elif word_or_token.text != BLANK_TOKEN: + n_chars += len(word_or_token.text) + return n_chars + + +def resegment_utt_obj(utt_obj, ass_file_config): + + # get list of just all words and tokens + all_words_and_tokens = [] + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + all_words_and_tokens.extend(segment_or_token.words_and_tokens) + else: + 
all_words_and_tokens.append(segment_or_token) + + # figure out how many chars will fit into one 'slide' and thus should be the max + # size of a segment + approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / ( + ass_file_config.fontsize * 0.6 + ) # assume chars 0.6 as wide as they are tall + approx_lines_per_segment = (PLAYERRESY - ass_file_config.marginv) / ( + ass_file_config.fontsize * 1.15 + ) # assume line spacing is 1.15 + if approx_lines_per_segment > ass_file_config.max_lines_per_segment: + approx_lines_per_segment = ass_file_config.max_lines_per_segment + + max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment) + + new_segments_and_tokens = [] + all_words_and_tokens_pointer = 0 + for word_or_token in all_words_and_tokens: + if type(word_or_token) is Token: + new_segments_and_tokens.append(word_or_token) + all_words_and_tokens_pointer += 1 + else: + break + + new_segments_and_tokens.append(Segment()) + + while all_words_and_tokens_pointer < len(all_words_and_tokens): + word_or_token = all_words_and_tokens[all_words_and_tokens_pointer] + if type(word_or_token) is Word: + + # if this is going to be the first word in the segment, we definitely want + # to add it to the segment + if not new_segments_and_tokens[-1].words_and_tokens: + new_segments_and_tokens[-1].words_and_tokens.append(word_or_token) + + else: + # if not the first word, check what the new length of the segment will be + # if short enough - add this word to this segment; + # if too long - add to a new segment + this_word_n_chars = _get_word_n_chars(word_or_token) + segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1]) + if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment: + new_segments_and_tokens[-1].words_and_tokens.append(word_or_token) + else: + new_segments_and_tokens.append(Segment()) + new_segments_and_tokens[-1].words_and_tokens.append(word_or_token) + + else: # i.e. 
word_or_token is a token + # currently this breaks the convention of tokens at the end/beginning + # of segments being listed as separate tokens in segment.word_and_tokens + # TODO: change code so we follow this convention + new_segments_and_tokens[-1].words_and_tokens.append(word_or_token) + + all_words_and_tokens_pointer += 1 + + utt_obj.segments_and_tokens = new_segments_and_tokens + + return utt_obj + + +def make_word_level_ass_file( + utt_obj, output_dir_root, ass_file_config, +): + + default_style_dict = { + "Name": "Default", + "Fontname": "Arial", + "Fontsize": str(ass_file_config.fontsize), + "PrimaryColour": "&Hffffff", + "SecondaryColour": "&Hffffff", + "OutlineColour": "&H0", + "BackColour": "&H0", + "Bold": "0", + "Italic": "0", + "Underline": "0", + "StrikeOut": "0", + "ScaleX": "100", + "ScaleY": "100", + "Spacing": "0", + "Angle": "0", + "BorderStyle": "1", + "Outline": "1", + "Shadow": "0", + "Alignment": "2", + "MarginL": str(MARGINL), + "MarginR": str(MARGINR), + "MarginV": str(ass_file_config.marginv), + "Encoding": "0", + } + + output_dir = os.path.join(output_dir_root, "ass", "words") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") + + with open(output_file, 'w') as f: + default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) + default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) + + f.write( + ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {PLAYERRESX}\n" + f"PlayResY: {PLAYERRESY}\n" + "\n" + "[V4+ Styles]\n" + f"{default_style_top_line}\n" + f"{default_style_bottom_line}\n" + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n" + ) + ) + + # write first set of subtitles for text before speech starts to be spoken + words_in_first_segment = [] + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + first_segment = segment_or_token + 
+ for word_or_token in first_segment.words_and_tokens: + if type(word_or_token) is Word: + words_in_first_segment.append(word_or_token) + break + + text_before_speech = r"{\c&c7c1c2&}" + " ".join([x.text for x in words_in_first_segment]) + r"{\r}" + subtitle_text = ( + f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,," + + text_before_speech.rstrip() + ) + + f.write(subtitle_text + '\n') + + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + segment = segment_or_token + + words_in_segment = [] + for word_or_token in segment.words_and_tokens: + if type(word_or_token) is Word: + words_in_segment.append(word_or_token) + + for word_i, word in enumerate(words_in_segment): + + text_before = " ".join([x.text for x in words_in_segment[:word_i]]) + if text_before != "": + text_before += " " + text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}" + + if word_i < len(words_in_segment) - 1: + text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]]) + else: + text_after = "" + text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}" + + aligned_text = r"{\c&H09ab39&}" + word.text + r"{\r}" + aligned_text_off = r"{\c&H3d2e31&}" + word.text + r"{\r}" + + subtitle_text = ( + f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,," + + text_before + + aligned_text + + text_after.rstrip() + ) + f.write(subtitle_text + '\n') + + # add subtitles without word-highlighting for when words are not being spoken + if word_i < len(words_in_segment) - 1: + last_word_end = float(words_in_segment[word_i].t_end) + next_word_start = float(words_in_segment[word_i + 1].t_start) + if next_word_start - last_word_end > 0.001: + subtitle_text = ( + f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,," + + text_before + + aligned_text_off + + text_after.rstrip() + ) + 
f.write(subtitle_text + '\n') + + utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file + + return utt_obj + + +def make_token_level_ass_file( + utt_obj, output_dir_root, ass_file_config, +): + + default_style_dict = { + "Name": "Default", + "Fontname": "Arial", + "Fontsize": str(ass_file_config.fontsize), + "PrimaryColour": "&Hffffff", + "SecondaryColour": "&Hffffff", + "OutlineColour": "&H0", + "BackColour": "&H0", + "Bold": "0", + "Italic": "0", + "Underline": "0", + "StrikeOut": "0", + "ScaleX": "100", + "ScaleY": "100", + "Spacing": "0", + "Angle": "0", + "BorderStyle": "1", + "Outline": "1", + "Shadow": "0", + "Alignment": "2", + "MarginL": str(MARGINL), + "MarginR": str(MARGINR), + "MarginV": str(ass_file_config.marginv), + "Encoding": "0", + } + + output_dir = os.path.join(output_dir_root, "ass", "tokens") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") + + with open(output_file, 'w') as f: + default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) + default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) + + f.write( + ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {PLAYERRESX}\n" + f"PlayResY: {PLAYERRESY}\n" + "ScaledBorderAndShadow: yes\n" + "\n" + "[V4+ Styles]\n" + f"{default_style_top_line}\n" + f"{default_style_bottom_line}\n" + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n" + ) + ) + + # write first set of subtitles for text before speech starts to be spoken + tokens_in_first_segment = [] + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + for word_or_token in segment_or_token.words_and_tokens: + if type(word_or_token) is Token: + if word_or_token.text != BLANK_TOKEN: + tokens_in_first_segment.append(word_or_token) + else: + for token in word_or_token.tokens: + if token.text != BLANK_TOKEN: + 
tokens_in_first_segment.append(token) + + break + + for token in tokens_in_first_segment: + token.text_cased = token.text_cased.replace( + "▁", " " + ) # replace underscores used in subword tokens with spaces + token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space + + text_before_speech = r"{\c&c7c1c2&}" + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}" + subtitle_text = ( + f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,," + + text_before_speech.rstrip() + ) + + f.write(subtitle_text + '\n') + + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + segment = segment_or_token + + tokens_in_segment = [] # make list of (non-blank) tokens + for word_or_token in segment.words_and_tokens: + if type(word_or_token) is Token: + if word_or_token.text != BLANK_TOKEN: + tokens_in_segment.append(word_or_token) + else: + for token in word_or_token.tokens: + if token.text != BLANK_TOKEN: + tokens_in_segment.append(token) + + for token in tokens_in_segment: + token.text_cased = token.text_cased.replace( + "▁", " " + ) # replace underscores used in subword tokens with spaces + token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space + + for token_i, token in enumerate(tokens_in_segment): + + text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]]) + text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}" + + if token_i < len(tokens_in_segment) - 1: + text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]]) + else: + text_after = "" + text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}" + + aligned_text = r"{\c&H09ab39&}" + token.text_cased + r"{\r}" + aligned_text_off = r"{\c&H3d2e31&}" + token.text_cased + r"{\r}" + + subtitle_text = ( + f"Dialogue: 
0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,," + + text_before + + aligned_text + + text_after.rstrip() + ) + f.write(subtitle_text + '\n') + + # add subtitles without word-highlighting for when words are not being spoken + if token_i < len(tokens_in_segment) - 1: + last_token_end = float(tokens_in_segment[token_i].t_end) + next_token_start = float(tokens_in_segment[token_i + 1].t_start) + if next_token_start - last_token_end > 0.001: + subtitle_text = ( + f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,," + + text_before + + aligned_text_off + + text_after.rstrip() + ) + f.write(subtitle_text + '\n') + + utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file + + return utt_obj diff --git a/tools/nemo_forced_aligner/utils/make_ctm_files.py b/tools/nemo_forced_aligner/utils/make_ctm_files.py new file mode 100644 index 000000000000..f0326c07cf8f --- /dev/null +++ b/tools/nemo_forced_aligner/utils/make_ctm_files.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import soundfile as sf +from utils.constants import BLANK_TOKEN, SPACE_TOKEN +from utils.data_prep import Segment, Word + + +def make_ctm_files( + utt_obj, output_dir_root, ctm_file_config, +): + """ + Function to save CTM files for all the utterances in the incoming batch. 
+ """ + + # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen + # in the case of the ground truth text being empty or the number of tokens being too large vs audio duration + if not utt_obj.segments_and_tokens: + return utt_obj + + # get audio file duration if we will need it later + if ctm_file_config.minimum_timestamp_duration > 0: + with sf.SoundFile(utt_obj.audio_filepath) as f: + audio_file_duration = f.frames / f.samplerate + else: + audio_file_duration = None + + utt_obj = make_ctm("tokens", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,) + utt_obj = make_ctm("words", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,) + utt_obj = make_ctm("segments", utt_obj, output_dir_root, audio_file_duration, ctm_file_config,) + + return utt_obj + + +def make_ctm( + alignment_level, utt_obj, output_dir_root, audio_file_duration, ctm_file_config, +): + output_dir = os.path.join(output_dir_root, "ctm", alignment_level) + os.makedirs(output_dir, exist_ok=True) + + boundary_info_utt = [] + for segment_or_token in utt_obj.segments_and_tokens: + if type(segment_or_token) is Segment: + segment = segment_or_token + if alignment_level == "segments": + boundary_info_utt.append(segment) + + for word_or_token in segment.words_and_tokens: + if type(word_or_token) is Word: + word = word_or_token + if alignment_level == "words": + boundary_info_utt.append(word) + + for token in word.tokens: + if alignment_level == "tokens": + boundary_info_utt.append(token) + + else: + token = word_or_token + if alignment_level == "tokens": + boundary_info_utt.append(token) + + else: + token = segment_or_token + if alignment_level == "tokens": + boundary_info_utt.append(token) + + with open(os.path.join(output_dir, f"{utt_obj.utt_id}.ctm"), "w") as f_ctm: + for boundary_info_ in boundary_info_utt: # loop over every token/word/segment + + # skip if t_start = t_end = negative number because we used it as a marker to skip some blank tokens + 
if not (boundary_info_.t_start < 0 or boundary_info_.t_end < 0): + text = boundary_info_.text + start_time = boundary_info_.t_start + end_time = boundary_info_.t_end + + if ( + ctm_file_config.minimum_timestamp_duration > 0 + and ctm_file_config.minimum_timestamp_duration > end_time - start_time + ): + # make the predicted duration of the token/word/segment longer, growing it outwards equal + # amounts from the predicted center of the token/word/segment + token_mid_point = (start_time + end_time) / 2 + start_time = max(token_mid_point - ctm_file_config.minimum_timestamp_duration / 2, 0) + end_time = min( + token_mid_point + ctm_file_config.minimum_timestamp_duration / 2, audio_file_duration + ) + + if not ( + text == BLANK_TOKEN and ctm_file_config.remove_blank_tokens + ): # don't save blanks if we don't want to + # replace any spaces with so we dont introduce extra space characters to our CTM files + text = text.replace(" ", SPACE_TOKEN) + + f_ctm.write(f"{utt_obj.utt_id} 1 {start_time:.2f} {end_time - start_time:.2f} {text}\n") + + utt_obj.saved_output_files[f"{alignment_level}_level_ctm_filepath"] = os.path.join( + output_dir, f"{utt_obj.utt_id}.ctm" + ) + + return utt_obj diff --git a/tools/nemo_forced_aligner/utils/make_output_files.py b/tools/nemo_forced_aligner/utils/make_output_files.py deleted file mode 100644 index a2d8c80a6580..000000000000 --- a/tools/nemo_forced_aligner/utils/make_output_files.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from pathlib import Path - -import soundfile as sf -from utils.constants import BLANK_TOKEN, SPACE_TOKEN - - -def _get_utt_id(audio_filepath, audio_filepath_parts_in_utt_id): - fp_parts = Path(audio_filepath).parts[-audio_filepath_parts_in_utt_id:] - utt_id = Path("_".join(fp_parts)).stem - utt_id = utt_id.replace(" ", "-") # replace any spaces in the filepath with dashes - return utt_id - - -def add_t_start_end_to_boundary_info(boundary_info_utt, alignment_utt): - """ - We use the list of alignments to add the timesteps where each token/word/segment is predicted to - start and end. - boundary_info_utt can be any one of the variables referred to as `token_info`, `word_info`, `segment_info` - in other parts of the code. - - e.g. the input boundary info could be - boundary_info_utt = [ - {'text': 'hi', 's_start': 1, 's_end': 3}, - {'text': 'world', 's_start': 7, 's_end': 15}, - {'text': 'hey', 's_start': 19, 's_end': 23}, - ] - - and the alignment could be - alignment_utt = [ 1, 1, 3, 3, 4, 5, 7, 7, 9, 10, 11, 12, 13, 15, 17, 17, 19, 21, 23, 23] - - in which case the output would be: - boundary_info_utt = [ - {'text': 'hi', 's_start': 1, 's_end': 3, 't_start': 0, 't_end': 3}, - {'text': 'world', 's_start': 7, 's_end': 15, 't_start': 6, 't_end': 13}, - {'text': 'hey', 's_start': 19, 's_end': 23, 't_start': 16, 't_end': 19}, - ] - """ - # first remove boundary_info of any items that are not in the alignment - # the only items we expect not to be in the alignment are blanks that the alignment chooses to skip - # we will iterate boundary_info in reverse order for this to make popping the items simple - s_in_alignment = set(alignment_utt) - for boundary_info_pointer in range(len(boundary_info_utt) - 1, -1, -1): - s_in_boundary_info = set( - range( - boundary_info_utt[boundary_info_pointer]["s_start"], - 
boundary_info_utt[boundary_info_pointer]["s_end"] + 1, - ) - ) - item_not_in_alignment = True - for s_ in s_in_boundary_info: - if s_ in s_in_alignment: - item_not_in_alignment = False - - if item_not_in_alignment: - boundary_info_utt.pop(boundary_info_pointer) - - # now update boundary_info with t_start and t_end - boundary_info_pointer = 0 - for t, s_at_t in enumerate(alignment_utt): - if s_at_t == boundary_info_utt[boundary_info_pointer]["s_start"]: - if "t_start" not in boundary_info_utt[boundary_info_pointer]: - # we have just reached the start of the word/token/segment in the alignment => update t_start - boundary_info_utt[boundary_info_pointer]["t_start"] = t - - if t < len(alignment_utt) - 1: # this if is to avoid accessing an index that is not in the list - if alignment_utt[t + 1] > boundary_info_utt[boundary_info_pointer]["s_end"]: - if "t_end" not in boundary_info_utt[boundary_info_pointer]: - boundary_info_utt[boundary_info_pointer]["t_end"] = t - - boundary_info_pointer += 1 - else: # i.e. t == len(alignment) - 1, i.e. we are a the final element in alignment - # add final t_end if we haven't already - if "t_end" not in boundary_info_utt[boundary_info_pointer]: - boundary_info_utt[boundary_info_pointer]["t_end"] = t - - if boundary_info_pointer == len(boundary_info_utt): - # we have finished populating boundary_info with t_start and t_end, - # but we might have some final remaining elements (blanks) in the alignment which we dont care about - # => break, so as not to cause issues trying to access boundary_info[boundary_info_pointer] - break - - return boundary_info_utt - - -def make_ctm( - boundary_info_batch, - alignments_batch, - manifest_lines_batch, - model, - model_downsample_factor, - output_dir, - remove_blank_tokens_from_ctm, - audio_filepath_parts_in_utt_id, - minimum_timestamp_duration, -): - """ - Function to save CTM files for all the utterances in the incoming batch. 
- """ - - assert len(boundary_info_batch) == len(alignments_batch) == len(manifest_lines_batch) - # we also assume that utterances are in the same order in boundary_info_batch, alignments_batch - # and manifest_lines_batch - this should be the case unless there is a strange bug upstream in the - # code - - os.makedirs(output_dir, exist_ok=True) - - # the ratio to convert from timesteps (the units of 't_start' and 't_end' in boundary_info_utt) - # to the number of samples ('samples' in the sense of 16000 'samples' per second) - timestep_to_sample_ratio = model.preprocessor.featurizer.hop_length * model_downsample_factor - - for boundary_info_utt, alignment_utt, manifest_line in zip( - boundary_info_batch, alignments_batch, manifest_lines_batch - ): - - boundary_info_utt = add_t_start_end_to_boundary_info(boundary_info_utt, alignment_utt) - - # get utt_id that will be used for saving CTM file as .ctm - utt_id = _get_utt_id(manifest_line['audio_filepath'], audio_filepath_parts_in_utt_id) - - # get audio file duration if we will need it later - if minimum_timestamp_duration > 0: - with sf.SoundFile(manifest_line["audio_filepath"]) as f: - audio_file_duration = f.frames / f.samplerate - - with open(os.path.join(output_dir, f"{utt_id}.ctm"), "w") as f_ctm: - for boundary_info_ in boundary_info_utt: # loop over every token/word/segment - text = boundary_info_["text"] - start_sample = boundary_info_["t_start"] * timestep_to_sample_ratio - end_sample = (boundary_info_["t_end"] + 1) * timestep_to_sample_ratio - 1 - - start_time = start_sample / model.cfg.sample_rate - end_time = end_sample / model.cfg.sample_rate - - if minimum_timestamp_duration > 0 and minimum_timestamp_duration > end_time - start_time: - # make the predicted duration of the token/word/segment longer, growing it outwards equal - # amounts from the predicted center of the token/word/segment - token_mid_point = (start_time + end_time) / 2 - start_time = max(token_mid_point - minimum_timestamp_duration / 2, 
0) - end_time = min(token_mid_point + minimum_timestamp_duration / 2, audio_file_duration) - - if not (text == BLANK_TOKEN and remove_blank_tokens_from_ctm): # don't save blanks if we don't want to - # replace any spaces with so we dont introduce extra space characters to our CTM files - text = text.replace(" ", SPACE_TOKEN) - - f_ctm.write(f"{utt_id} 1 {start_time:.2f} {end_time - start_time:.2f} {text}\n") - - return None - - -def make_new_manifest( - output_dir, - original_manifest_filepath, - additional_ctm_grouping_separator, - audio_filepath_parts_in_utt_id, - pred_text_all_lines, -): - """ - Function to save a new manifest with the same info as the original manifest, but also the paths to the - CTM files for each utterance and the "pred_text" if it was used for the alignment. - """ - if pred_text_all_lines: - with open(original_manifest_filepath, 'r') as f: - num_lines_in_manifest = sum(1 for _ in f) - - if not num_lines_in_manifest == len(pred_text_all_lines): - raise RuntimeError( - f"Number of lines in the original manifest ({num_lines_in_manifest}) does not match " - f"the number of pred_texts we have ({len(pred_text_all_lines)}). Something has gone wrong." 
- ) - - tgt_manifest_name = str(Path(original_manifest_filepath).stem) + "_with_ctm_paths.json" - tgt_manifest_filepath = str(Path(output_dir) / tgt_manifest_name) - - with open(original_manifest_filepath, 'r') as fin, open(tgt_manifest_filepath, 'w') as fout: - for i_line, line in enumerate(fin): - data = json.loads(line) - - utt_id = _get_utt_id(data["audio_filepath"], audio_filepath_parts_in_utt_id) - - data["token_level_ctm_filepath"] = str(Path(output_dir) / "tokens" / f"{utt_id}.ctm") - data["word_level_ctm_filepath"] = str(Path(output_dir) / "words" / f"{utt_id}.ctm") - - if additional_ctm_grouping_separator: - data["additional_segment_level_ctm_filepath"] = str( - Path(output_dir) / "additional_segments" / f"{utt_id}.ctm" - ) - - if pred_text_all_lines: - data['pred_text'] = pred_text_all_lines[i_line] - - new_line = json.dumps(data) - - fout.write(f"{new_line}\n") diff --git a/tools/nemo_forced_aligner/utils/make_output_manifest.py b/tools/nemo_forced_aligner/utils/make_output_manifest.py new file mode 100644 index 000000000000..7ee3fc77f7ab --- /dev/null +++ b/tools/nemo_forced_aligner/utils/make_output_manifest.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + + +def write_manifest_out_line( + f_manifest_out, utt_obj, +): + + data = {"audio_filepath": utt_obj.audio_filepath} + if not utt_obj.text is None: + data["text"] = utt_obj.text + + if not utt_obj.pred_text is None: + data["pred_text"] = utt_obj.pred_text + + for key, val in utt_obj.saved_output_files.items(): + data[key] = val + + new_line = json.dumps(data) + f_manifest_out.write(f"{new_line}\n") + + return None diff --git a/tools/nemo_forced_aligner/utils/viterbi_decoding.py b/tools/nemo_forced_aligner/utils/viterbi_decoding.py index bc9a45dda527..78336f800e14 100644 --- a/tools/nemo_forced_aligner/utils/viterbi_decoding.py +++ b/tools/nemo_forced_aligner/utils/viterbi_decoding.py @@ -36,6 +36,7 @@ def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) Looks like: [[0, 0, 1, 2, 2, 3, 3, ..., ], ..., [0, 1, 2, 2, 2, 3, 4, ....]]. Each list inside alignments_batch is of length T_batch[location of utt in batch]. """ + B, T_max, _ = log_probs_batch.shape U_max = y_batch.shape[1] @@ -50,15 +51,14 @@ def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) # make log_probs_padded tensor of shape (B, T_max, V +1 ) where all of # log_probs_padded[:,:,-1] is the 'V_NEGATIVE_NUM' log_probs_padded = torch.cat((log_probs_batch, padding_for_log_probs), dim=2) - # make log_probs_reordered tensor of shape (B, T_max, U_max) - # it contains the log_probs for only the tokens that are in the Ground Truth, and in the order - # that they occur - log_probs_reordered = torch.gather(input=log_probs_padded, dim=2, index=y_batch.unsqueeze(1).repeat(1, T_max, 1)) - # initialize tensors of viterbi probabilies and backpointers - v_matrix = V_NEGATIVE_NUM * torch.ones_like(log_probs_reordered) - backpointers = -999 * torch.ones_like(v_matrix) - v_matrix[:, 0, :2] = log_probs_reordered[:, 0, :2] + # initialize v_prev - tensor of previous timestep's viterbi probabilies, of shape (B, U_max) + v_prev = V_NEGATIVE_NUM * 
torch.ones((B, U_max), device=viterbi_device) + v_prev[:, :2] = torch.gather(input=log_probs_padded[:, 0, :], dim=1, index=y_batch[:, :2]) + + # initialize backpointers_rel - which contains values like 0 to indicate the backpointer is to the same u index, + # 1 to indicate the backpointer pointing to the u-1 index and 2 to indicate the backpointer is pointing to the u-2 index + backpointers_rel = -99 * torch.ones((B, T_max, U_max), dtype=torch.int8, device=viterbi_device) # Make a letter_repetition_mask the same shape as y_batch # the letter_repetition_mask will have 'True' where the token (including blanks) is the same @@ -70,24 +70,23 @@ def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) letter_repetition_mask[:, :2] = 1 # make sure dont apply mask to first 2 tokens letter_repetition_mask = letter_repetition_mask == 0 - # bp_absolute_template is a tensor we will need during the Viterbi decoding to convert our argmaxes from indices between 0 and 2, - # to indices in the range (0, U_max-1) indicating from which token the mostly path up to that point came from. - # it is a tensor of shape (B, U_max) that looks like - # bp_absolute_template = [ - # [0, 1, 2, ...,, U_max] - # [0, 1, 2, ...,, U_max] - # [0, 1, 2, ...,, U_max] - # ... 
rows repeated so there are B number of rows in total - # ] - bp_absolute_template = torch.arange(U_max, device=viterbi_device).unsqueeze(0).repeat(B, 1) - for t in range(1, T_max): # e_current is a tensor of shape (B, U_max) of the log probs of every possible token at the current timestep - e_current = log_probs_reordered[:, t, :] + e_current = torch.gather(input=log_probs_padded[:, t, :], dim=1, index=y_batch) + + # apply a mask to e_current to cope with the fact that we do not keep the whole v_matrix and continue + # calculating viterbi probabilities during some 'padding' timesteps + t_exceeded_T_batch = t >= T_batch - # v_prev is a tensor of shape (B, U_max) of the viterbi probabilities 1 timestep back and in the same token position - v_prev = v_matrix[:, t - 1, :] + U_can_be_final = torch.logical_or( + torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 0), + torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 1), + ) + + mask = torch.logical_not(torch.logical_and(t_exceeded_T_batch.unsqueeze(1), U_can_be_final,)).long() + + e_current = e_current * mask # v_prev_shifted is a tensor of shape (B, U_max) of the viterbi probabilities 1 timestep back and 1 token position back v_prev_shifted = torch.roll(v_prev, shifts=1, dims=1) @@ -111,26 +110,27 @@ def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) # candidates_v_current are our candidate viterbi probabilities for every token position, from which # we will pick the max and record the argmax candidates_v_current = v_prev_dup + e_current.unsqueeze(2) - v_current, bp_relative = torch.max(candidates_v_current, dim=2) - - # convert our argmaxes from indices between 0 and 2, to indices in the range (0, U_max-1) indicating - # from which token the mostly path up to that point came from - bp_absolute = bp_absolute_template - bp_relative + # we straight away save results in v_prev instead of v_current, so that the variable 
v_prev will be ready for the + # next iteration of the for-loop + v_prev, bp_relative = torch.max(candidates_v_current, dim=2) - # update our tensors containing all the viterbi probabilites and backpointers - v_matrix[:, t, :] = v_current - backpointers[:, t, :] = bp_absolute + backpointers_rel[:, t, :] = bp_relative - # trace backpointers TODO: parallelize over batch_size + # trace backpointers alignments_batch = [] for b in range(B): T_b = int(T_batch[b]) U_b = int(U_batch[b]) - final_state = int(torch.argmax(v_matrix[b, T_b - 1, U_b - 2 : U_b])) + U_b - 2 - alignment_b = [final_state] - for t in range(T_b - 1, 0, -1): - alignment_b.insert(0, int(backpointers[b, t, alignment_b[0]])) + if U_b == 1: # i.e. we put only a blank token in the reference text because the reference text is empty + current_u = 0 # set initial u to 0 and let the rest of the code block run as usual + else: + current_u = int(torch.argmax(v_prev[b, U_b - 2 : U_b])) + U_b - 2 + alignment_b = [current_u] + for t in range(T_max - 1, 0, -1): + current_u = current_u - int(backpointers_rel[b, t, current_u]) + alignment_b.insert(0, current_u) + alignment_b = alignment_b[:T_b] alignments_batch.append(alignment_b) return alignments_batch From 2db352a67c38425062cea936d0623e405ae07bb1 Mon Sep 17 00:00:00 2001 From: Matvei Novikov Date: Sat, 10 Jun 2023 04:20:20 +0400 Subject: [PATCH 036/123] Added rouge monitoring support for T5 (#6737) * Added rouge monitoring support for t5 Signed-off-by: Matvei Novikov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Matvei Novikov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../models/language_modeling/megatron_finetune_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py 
b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 4ed71756e60e..32024deb19b4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -49,7 +49,6 @@ HAVE_MEGATRON_CORE = False - __all__ = ['MegatronT5FinetuneModel'] @@ -204,7 +203,7 @@ def on_train_epoch_start(self) -> None: return super().on_train_epoch_start() def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_are_strings=False): - if metric_name == 'exact_string_match': + if metric_name in ('exact_string_match', 'rouge'): return pred, label pred = pred.replace(' ', '') label = label.replace(' ', '') @@ -445,6 +444,8 @@ def inference_epoch_end(self, outputs, mode, data_cfg): self.val_metric[dataloader_idx] if mode == 'validation' else self.test_metric[dataloader_idx] ) metric = metric_object.compute() + if metric_name == 'rouge': + metric = metric['rouge1_fmeasure'] # Handle logging of GLUE/XNLI separately here. XNLI has a separate metric per language. 
if isinstance(metric, dict): # GLUE case: @@ -458,7 +459,8 @@ def inference_epoch_end(self, outputs, mode, data_cfg): if k != 'acc' and 'total' not in k: self.log(metric_log_key + f'_{k}', v, batch_size=1) logging.info(f"{mode} {metric_name} lang {k} : {v}") - metric = metric['acc'] + if metric_name != 'rouge': + metric = metric['acc'] else: self.log(metric_log_key, metric, batch_size=1) logging.info(f"{metric_log_key}: {metric}") From a87702a522387da0aac62dc1f90a88a8e0bfc7cc Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Mon, 12 Jun 2023 23:15:00 +0800 Subject: [PATCH 037/123] GPT extrapolatable position embedding (xpos/sandwich/alibi/kerple) and Flash Attention (#6666) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move to nvidia megatron repo (#6465) (#6475) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * Megatron KERPLE positional embeddings (#6478) (#6480) * [TTS] FastPitch adapter fine-tune and conditional layer normalization (#6416) [TTS] FastPitch adapter fine-tune and conditional layer normalization (#6416) --------- * [TTS] whitelist broken path fix. (#6412) * [TTS] whitelist broken path fix. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- * [TTS] FastPitch speaker encoder (#6417) * Add initial codes * Remove wemb * Fix import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restore aligner loss * Add ConditionalInput * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix error and support pre-trained config * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Follow comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Rename config * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change copyright and random weight test * Add initial codes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix import error * Add initial codes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix dataset error * Remove reference speaker embedding * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove SV encoder * Follow comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix length type * Fix append * Move error msg * Add look-up into speaker encoder * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add valueerror msg * Move lookup * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix error * Rebase and Fix error * Fix spk encoder * Rename n_speakers * Follow comments * [pre-commit.ci] 
auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix n_speakers None error --------- * Sharded manifests for tarred datasets (#6395) * testing sharded manifests * compatibility * proper fixes * adding flag tot convert_to_tarred_audio_dataset * shard_manifests conf param * propagating the shard_manifests param * propagating the shard_manifests param * distributed checks * typo * typo * fixes * fixes * fixes * fixes * fixes * fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes based on PR comments and tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes to convert_to_tarred_audio_dataset.py * reversing manifest shards flag * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests * excluding manifests from webdataset url expansion * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * expand manifest paths before attempting to cache from datastore * explicit use of UTF-8 for manifest i/o --------- * Update wfst_text_normalization.rst (#6374) Add Hungarian (incoming in NeMo-text-processing) * Support Swiglu in TP PP Conversion (#6437) (#6451) * Support Swiglu in TP PP Conversion * Guard activation * Guard activation --------- * Update NeMo_TTS_Primer.ipynb (#6436) * Update NeMo_TTS_Primer.ipynb Changed a mistake in line 782. Instead of frequency band (ie. pitch) we should write frequency bin. Note that frequency bins in FFT are not related to pitch. * Update NeMo_TTS_Primer.ipynb Corrected the description of spectrogram and mel spectrogram calculations in lines 782 & 783 and added a fourth point to the description and added a reference for more mathematical details at the end of this point. 
--------- * add rampup batch size support for Megatron GPT (#6424) * added rampup batch size support * added tests for rampup batch size * fixed the typos * added assertions * changed assertion rules * deleted unused imports * changed tests for rampup batch size * updated rampup batch size tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed styling * rampup batch size tests changes --------- * Meagtron encoder decoder fix for empty validation outputs (#6459) (#6461) * 1. Meagtron encoder decoder fix for empty validation outputs. * 1. Debugging. --------- * Code-Switching dataset creation - upgrading to aggregate tokenizer manifest format (#6448) * added functionality to create agg tokenizer compatible manifest for CS, flag to use this mode by default * updated README with the new agg_tokenizer_manifest flag * fixed typo in scripts/speech_recognition/code_switching/README.md * changed agg_tokenizer_manifest to is_lid_manifest --------- * Added/updated new Conformer configs (#6426) (#6467) * Update script for ngram rnnt and hat beam search decoding (#6370) * add rnnt ngram beamsearch script * add return encoding embedding option * update script * add rnnt and hat ngram decoding script * add some parameters * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add return_encoder_embeddings parameter to RNNTDecodingConfig * replace return_encoder_embeddings parameter * generalization of scipt behavior * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove return_encoder_embeddings parameter * remove return_encoder_embeddings parameter * add manual encoder_embeddings calculation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix beam_width value to 8 * fix rescoring description --------- * BERT pre-training mp fork to spawn (#6442) (#6454) * 
change bert fork to spawn * num_workers=0 fix --------- * fix replace_bos_with_pad not found (#6443) (#6450) * reduce workers on NMT CI (#6472) (#6474) * 1. Added KERPLE positional embeddings to encoder-decoder. * 1. Added a missing file. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 1. Fixing commits. * 1. Debugging. * 1. Debugging. * 1. Debugging. * 1. Debugging. --------- Signed-off-by: hsiehjackson Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Dima Rekesh Signed-off-by: Jim O’Regan Signed-off-by: smajumdar Signed-off-by: Mostafa Ghorbandoost Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Micha Livne Signed-off-by: Kunal Dhawan Signed-off-by: andrusenkoau Signed-off-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Signed-off-by: Abhinav Khattar Co-authored-by: Micha Livne Co-authored-by: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dima Rekesh Co-authored-by: Jim O’Regan Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Somshubra Majumdar Co-authored-by: Mostafa Ghorbandoost Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Eric Harper Co-authored-by: Micha Livne Co-authored-by: Kunal Dhawan Co-authored-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * Fix an invalid link in get_data.py of ljspeech (#6456) Usage of the link in line 63 leads to downloading a html file not a tsv file, so we need to change it to a raw link. 
Signed-off-by: Mostafa Ghorbandoost Signed-off-by: hsiehjackson * 1. Added external index sample. (#6462) (#6483) Signed-off-by: Micha Livne Co-authored-by: Micha Livne Signed-off-by: hsiehjackson * Update README to add core installation (#6488) (#6489) * update README for megatron-core * fix --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * Fix cache aware hybrid bugs (#6466) (#6484) Signed-off-by: hsiehjackson * Fix typos (#6494) (#6495) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: hsiehjackson * Add disclaimer about dataset for ASR (#6496) Signed-off-by: smajumdar Signed-off-by: hsiehjackson * fix (#6502) datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p NameError: name 'is_tarred_path' is not defined Co-authored-by: George Signed-off-by: hsiehjackson * fix broken links r1.18.0 (#6501) (#6504) * fix broken links * fix broken links --------- Signed-off-by: Evelina Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Create functions for TTS preprocessing without dataloader (#6317) * [TTS] Create functions for TTS preprocessing without dataloader Signed-off-by: Ryan Signed-off-by: hsiehjackson * Cache aware streaming nfa (#6209) * add cache aware streaming to nemo aligner Signed-off-by: Slyne Deng Signed-off-by: hsiehjackson * [BugFix] Force _get_batch_preds() to keep logits in decoder timestamps generator (#6499) * [BugFix] _get_batch_preds() is forced to keep logits in decoder timestamps generators Signed-off-by: Taejin Park * Ingnore keep_logits boolean in FrameASRBatchLogits Signed-off-by: Taejin Park --------- Signed-off-by: Taejin Park Co-authored-by: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Fix FastPitch energy code (#6511) Signed-off-by: Ryan Signed-off-by: hsiehjackson * fix custom forward_torch_softmax (#6512) (#6517) Signed-off-by: 
Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * [TTS] fixed broken path. (#6514) (#6518) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix normalization of impulse response in ImpulsePerturbation (#6505) Signed-off-by: Ante Jukić Signed-off-by: hsiehjackson * Add interleaved pp support (#6498) * Add support for Virtual Pipeline Parallel conversion Signed-off-by: smajumdar * Add support for Virtual Pipeline Parallel conversion Signed-off-by: smajumdar * Switch to megatron core Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix typos (#6523) * Fix typos Signed-off-by: smajumdar * Fix typos Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Signed-off-by: hsiehjackson * New noise_norm perturbation based on Riva work (#6445) * Initial commit for new noise_norm perturbation Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Minor fix to random seed in perturb Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated code to reflect feedback Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates for feedback given by code reviewers Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updates in response to PR feedback Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * Added comment about ref_mic being None Signed-off-by: Daniel Egert * Updated perturb to use inspect module Signed-off-by: Daniel Egert --------- Signed-off-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Add script for computing feature stats (#6508) * [TTS] Add script for computing feature stats Signed-off-by: Ryan * [TTS] Add overwrite config Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: hsiehjackson * Add Frame-VAD model and datasets (#6441) * add model, dataset, necessary utils and tests Signed-off-by: stevehuang52 * fix tarred data Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * update docstring Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * update pretrained model info Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Signed-off-by: hsiehjackson * Support dynamic length batches with GPT SFT (#6510) * Support synamic length with GPT SFT Signed-off-by: Abhinav Khattar * make branch functional Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Signed-off-by: hsiehjackson * added back the fast emit section to the configs. (#6540) (#6542) * added back the fast emit section to the configs. * added back the fast emit section to the configs. 
--------- Signed-off-by: Vahid Co-authored-by: Vahid Noroozi Signed-off-by: hsiehjackson * removing unnessary avoid_bfloat16_autocast_context (#6481) Signed-off-by: Dima Rekesh Signed-off-by: hsiehjackson * FC models in menu (#6473) * FC models in menu Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Dima Rekesh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Add tutorials for FastPitch TTS speaker adaptation with adapters (#6431) * Add tts adapter tutorial Signed-off-by: hsiehjackson * Update main tutorial Signed-off-by: hsiehjackson * Add tts adapter tutorial Signed-off-by: hsiehjackson * Update main tutorial Signed-off-by: hsiehjackson * Update tutorial Signed-off-by: hsiehjackson * Follow comments Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Follow comments Signed-off-by: hsiehjackson * Fix load .nemo error Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support multi-speaker fine-tune Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Follow comments Signed-off-by: hsiehjackson * Use .nemo Signed-off-by: hsiehjackson * Follow Comments Signed-off-by: hsiehjackson * Fix bug Signed-off-by: hsiehjackson * Fix bug Signed-off-by: hsiehjackson * Fix bug Signed-off-by: hsiehjackson * Add precomputed speaker emb Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix space Signed-off-by: hsiehjackson * Remove repeated argument Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * 
optional batch size Signed-off-by: hsiehjackson * Fix comments in notebook Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Create initial TTS dataset feature processors (#6507) Signed-off-by: Ryan Signed-off-by: hsiehjackson * fix (#6529) (#6546) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * Add FastConformer Hybrid ASR models for EN, ES, IT, DE, PL, HR, UA, BY (#6549) (#6553) * Added fastconfomer hybrid asr models for en, es, it, de, pl, hr, ua, by * updated ASR docs with the fastconformer hybrid checkpoints * added the fastconformer RNNT and CTC models --------- Signed-off-by: KunalDhawan Co-authored-by: Kunal Dhawan Signed-off-by: hsiehjackson * Add scores for FastConformer models (#6557) (#6558) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: hsiehjackson * Fix fp16 (#6543) (#6544) Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian Signed-off-by: hsiehjackson * Patch transcribe and support offline transcribe for hybrid model (#6550) (#6559) Signed-off-by: fayejf Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix notebook bad json (#6561) Signed-off-by: smajumdar Signed-off-by: hsiehjackson * Change Megatron Enc Dec model to use persistent_workers (#6548) (#6552) * persistent workers * fix --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Make KenLM with PC for AggregateTokenizer and merge it (#6081) * do_lowercase, rm_punctuation Signed-off-by: Nikolay Karpov * support beam_strategy = beam Signed-off-by: Nikolay Karpov * black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix config and^Cunctuation capitalization 
Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm math Signed-off-by: Nikolay Karpov * update kenlm Signed-off-by: Nikolay Karpov * black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add opengrm Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * mv install_beamsearch_decoders Signed-off-by: Nikolay Karpov * punctuation_to_preserve Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Only tikenizer opion Signed-off-by: Nikolay Karpov * Black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * DEFAULT_TOKEN_OFFSET Signed-off-by: Nikolay Karpov * aggregate_tokenizer Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * install kenlm with more than 5gram Signed-off-by: Nikolay Karpov * install_beamsearch_decoders Signed-off-by: Nikolay Karpov * ngram_bin_path kenlm_bin_path Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * black Signed-off-by: Nikolay Karpov * fix greedy PC bug Signed-off-by: Nikolay Karpov * move global params Signed-off-by: Nikolay Karpov * fix description and perplexity Signed-off-by: Nikolay Karpov * fix description Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * NEMO_PATH Signed-off-by: Nikolay Karpov * nemo:23.01 Signed-off-by: Nikolay Karpov * License Signed-off-by: Nikolay Karpov * description Signed-off-by: Nikolay Karpov * isinstance Signed-off-by: Nikolay Karpov * refactor 
kenlm stdin Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * black Signed-off-by: Nikolay Karpov * add cmd arg Signed-off-by: Nikolay Karpov * use new iter_files Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * EncDecHybridRNNTCTCModel Signed-off-by: Nikolay Karpov * punctuation Signed-off-by: Nikolay Karpov * train_kenlm args Signed-off-by: Nikolay Karpov * add docstrings Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add ngram_merge docs Signed-off-by: Nikolay Karpov * ngram_prune Signed-off-by: Nikolay Karpov * rename to ngram_merge Signed-off-by: Nikolay Karpov * rename to ngram Signed-off-by: Nikolay Karpov * add comments Signed-off-by: Nikolay Karpov * Ngram Signed-off-by: Nikolay Karpov * nemo_model_file Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * install_opengrm_ngram Signed-off-by: Nikolay Karpov * install opengrm Signed-off-by: Nikolay Karpov * rename to install_opengrm.sh Signed-off-by: Nikolay Karpov * rm extra import Signed-off-by: Nikolay Karpov * train_paths Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * text_processing Signed-off-by: Nikolay Karpov * fix ngram_bin_path Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * DECODERS_PATH Signed-off-by: Nikolay Karpov * farcompile Signed-off-by: Nikolay Karpov * rm text processing Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * text_processing Signed-off-by: Nikolay Karpov * 
AggregateTokenizer.DummyTokenizer Signed-off-by: Nikolay Karpov * comments Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TextProcessingConfig Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo Signed-off-by: Nikolay Karpov * doc Signed-off-by: Nikolay Karpov * types Signed-off-by: Nikolay Karpov * nemo_model_file Signed-off-by: Nikolay Karpov * rm assert Signed-off-by: Nikolay Karpov * import kenlm_utils Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * return None Signed-off-by: Nikolay Karpov * Copyright Signed-off-by: Nikolay Karpov * 2022 Signed-off-by: Nikolay Karpov * 2023 Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Co-authored-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * fix for running on 1 GPU. 
Signed-off-by: hsiehjackson * temp rtd fix (#6568) (#6569) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * [TTS] Add script for mapping speaker names to indices (#6509) Signed-off-by: Ryan Signed-off-by: hsiehjackson * whitespace (#6574) Signed-off-by: Nikolay Karpov Signed-off-by: hsiehjackson * Update manifest.py for speedup (#6565) (#6573) * Update manifest.py Re-order the checks for faster processing audio filepaths that are already absolute paths * Update manifest.py --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Vahid Noroozi Signed-off-by: hsiehjackson * More streaming conformer export fixes (#6567) (#6578) Signed-off-by: Greg Clark Co-authored-by: Greg Clark Co-authored-by: Vahid Noroozi Signed-off-by: hsiehjackson * user selected max_seq_len should be less than model's max_seq_len (#6333) (#6386) * user selection should not break model max limit * eval max seq length --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Framework for PEFT via mixins (#6391) * init commit ptuning via mixin Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * gpt ptuning places virtual tokens on the left only Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * encoder input modified when pre_process is true Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * optimizer group and state dict updates Signed-off-by: 
arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adapter ptuning working for pp>1 Signed-off-by: arendu * adapter defaults Signed-off-by: arendu * adapter ptuining config defaults Signed-off-by: arendu * training works Signed-off-by: arendu * loading and saving adapter only params during training Signed-off-by: arendu * added checks and comments Signed-off-by: arendu * clean up Signed-off-by: arendu * checks for grad is None before calling all_reduce Signed-off-by: arendu * load adapter .nemo file working Signed-off-by: arendu * resume training for adapters Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * peft tuning Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor Signed-off-by: arendu * file not needed Signed-off-by: arendu * undo prompt learning dataset changes Signed-off-by: arendu * undo updates to gpt prompt learning model Signed-off-by: arendu * naming updates Signed-off-by: arendu * decoding Signed-off-by: arendu * predict_step in gpt_sft_model Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed inference from tuning config Signed-off-by: arendu * no test in peft training Signed-off-by: arendu * answer only loss and correct defaults for val_loss Signed-off-by: arendu * hybrid adapters and ptuning Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * eval working.. 
Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * prepending tokens for ptuning Signed-off-by: arendu * cleaned up eval config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clean up Signed-off-by: arendu * update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * default prompt template Signed-off-by: arendu * Lora added Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support dynamic length with GPT SFT Signed-off-by: Abhinav Khattar * make branch functional Signed-off-by: Abhinav Khattar * defaults to max_pad_length=False in GPT SFT dataset Signed-off-by: arendu * adapter parallel_adapters to support Lora Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added early stopping by default Signed-off-by: arendu * eval script for peft and eval config.
bug fixes in predict step and added out_features to t5 adapter config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * docs Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * better defaults Signed-off-by: arendu * updates Signed-off-by: arendu * update Signed-off-by: arendu * docs Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: Abhinav Khattar Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * cache and reuse inputs (#6422) (#6452) Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Add patches for Virtual Parallel conversion (#6589) * Add patches for Virtual Parallel conversion Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Pass `.scale` instead of scaler object to core (#6551) * pass .scale instead of scaler object to core (#6545) Signed-off-by: Abhinav Khattar Co-authored-by: Eric Harper * Update megatron_gpt_model.py Signed-off-by: Abhinav Khattar * scale changes for main Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Documentation for ASR-TTS models (#6594) (#6595) * Add
docs about hybrid ASR-TTS models * Add docs about text-only datasets * Add docs about ASR-TTS checkpoints * Add docs about ASR-TTS configs and training * Clean up * ASR-TTS docs: add to api, fix imports * Clean up * Wrap optional import * Revert general ASR import --------- Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev Signed-off-by: hsiehjackson * [TTS] Fix aligner nan loss in fp32 (#6435) * Fix nan loss in fp32 Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Update SDP docs (#6485) (#6596) * add info about SDP e.g. processor classes in docs * add link to SDP docs in README * address code review comments and add SDP overview diagram * Fix spelling typo --------- Signed-off-by: Elena Rastorgueva Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Signed-off-by: hsiehjackson * Bug/typo fixes (#6599) Signed-off-by: Igor Gitman Signed-off-by: hsiehjackson * Manual garbage collection with an interval (#6469) (#6482) * Manual garbage collection with an interval * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use trainer.global_step for tracking the interval of GC --------- Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Make tensor split contiguous (#6580) (#6593) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * [ASR] Fix for old models in change_attention_model (#6608) * fixes Signed-off-by: sam1373 * done already Signed-off-by: sam1373 --------- Signed-off-by: sam1373 Signed-off-by: hsiehjackson * Update manifest.py to use os.path 
for get_full_path (#6598) * Update manifest.py to use os.path for get_full_path Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update manifest.py to get rid of pathlib Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update manifest.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update manifest.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Vahid Noroozi Signed-off-by: hsiehjackson * Cherry pick commits in #6601 to main (#6611) * fix write Signed-off-by: fayejf * decoding ctc Signed-off-by: fayejf * temp set rnnt decoding return_best_hypothesis to true Signed-off-by: fayejf * add wer cal back to transcribe_speech as requested Signed-off-by: fayejf * add wer cal back to speech_to_text_buffered_infer_rnnt as requested Signed-off-by: fayejf * add wer cal back to speech_to_text_buffered_infer_ctc as requested Signed-off-by: fayejf * style fix Signed-off-by: fayejf * reflect change in asr_evaluator Signed-off-by: fayejf * reflect som and vahid comment Signed-off-by: fayejf * remove return_best_hy=true in transcribe_speech Signed-off-by: fayejf * no text skip Signed-off-by: fayejf * revert partial Signed-off-by: fayejf --------- Signed-off-by: fayejf Signed-off-by: hsiehjackson * Create dummy iters to satisfy len checks (#6600) (#6603) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper
Signed-off-by: hsiehjackson * add GPT eval mode fix for interleaved to main (#6610) Signed-off-by: Abhinav Khattar Signed-off-by: hsiehjackson * Fix batch size reconf for T5 FT for multi-validation (#6582) (#6588) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Not doing CastToFloat by default (#6524) (#6563) * Not doing CastToFloat by default * Added docustring * Dummy commit --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * Turn autocast off when precision is fp32 (#6576) * Turn autocast off when precision is fp32 (#6554) * Turn autocast off when precision is fp32 Signed-off-by: Abhinav Khattar * address review Signed-off-by: Abhinav Khattar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Abhinav Khattar * merge Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * correct auto-merge Signed-off-by: Abhinav Khattar * correct auto-merge Signed-off-by: Abhinav Khattar * add to GPT SFT Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * update core commit hash in readme (#6622) (#6623) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * add hat image to docs (#6619) (#6621) Signed-off-by: andrusenkoau Co-authored-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Signed-off-by: hsiehjackson * Allow indices exchange via distributed (#6618) (#6624) Signed-off-by: Mikołaj Błaż Co-authored-by: mikolajblaz Signed-off-by: hsiehjackson * Offline and 
streaming inference support for hybrid model (#6570) * streaming buffered for hybrid + ctc Signed-off-by: fayejf * change default model_stride in eval.yaml Signed-off-by: fayejf * add fc model_stride Signed-off-by: fayejf * small fix Signed-off-by: fayejf * check whether model and decoding match Signed-off-by: fayejf * small fix Signed-off-by: fayejf * streaming buffered for hybrid + rnnt Signed-off-by: fayejf * style fix Signed-off-by: fayejf * fix yaml Signed-off-by: fayejf * reflect comment wip Signed-off-by: fayejf * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: fayejf * refactor and verified Signed-off-by: fayejf * add get_full_path to buffered Signed-off-by: fayejf * small fix Signed-off-by: fayejf * add RNNTDecodingConfig Signed-off-by: fayejf * model name & instruction of changing decoding Signed-off-by: fayejf --------- Signed-off-by: fayejf Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Patch decoding for PC models (#6630) (#6631) * Patch decoding logic for PC models * Patch decoding logic for PC models --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: hsiehjackson * Fix wer.py where 'errors' variable was not set (#6633) (#6634) Fix wer.py where 'errors' variable was not set when both reference and hypothesis are empty strings Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: hsiehjackson * Restore GPT support for interleaved pipeline parallelism (#6528) (#6613) * Restore logic for data-parallel communication with pipeline parallelism in GPT * Support dynamic attention masks in GPT * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 
* Debug typos * Debug data iterator caching with interleaved pipeline parallelism Each model chunk accesses the data iterator multiple times, so we need to cache multiple samples. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Megatron-LM commit * Distinguish between list of data iterators and data iterator that is a list * Create dummy iters to satisfy len checks * Kludge while waiting for Megatron-LM update * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * set transformers offline to avoid rate limiting --------- Signed-off-by: Tim Moon Signed-off-by: Eric Harper Signed-off-by: Abhinav Khattar Signed-off-by: ericharper Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * Add FA Signed-off-by: hsiehjackson * Fix XPOS Signed-off-by: hsiehjackson * Add warning Signed-off-by: hsiehjackson * Fix bugs Signed-off-by: hsiehjackson * Fix attention Signed-off-by: hsiehjackson * Fix comment Signed-off-by: hsiehjackson * Fix cast dtype Signed-off-by: hsiehjackson * Undo xpos Signed-off-by: hsiehjackson * bugfix (#6636) Signed-off-by: fayejf Signed-off-by: hsiehjackson * Disable interctc tests (#6638) Signed-off-by: Igor Gitman Signed-off-by: hsiehjackson * Add megatron_core to requirements (#6639) (#6640) * add megatron_core to requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: ericharper Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Remove from jenkins (#6642) * Remove from jenkins (#6641) * add megatron_core to requirements Signed-off-by: ericharper * remove from jenkins
Signed-off-by: ericharper --------- Signed-off-by: ericharper * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove dup Signed-off-by: ericharper --------- Signed-off-by: ericharper Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * sft model can use this script for eval (#6637) * sft model can use this script for eval Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * please fix me Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] Fix TTS audio preprocessing bugs (#6628) Signed-off-by: Ryan Signed-off-by: hsiehjackson * Move black parameters to pyproject.toml (#6647) Signed-off-by: Vladimir Bataev Signed-off-by: hsiehjackson * ASR-TTS Models: Support hybrid RNNT-CTC, improve docs. 
(#6620) * ASR-TTS: support hybrid RNNT-CTC models * Do not warn on optional import * Explain adding options to config * Fix import guard docs * Add docs for ConcatDataset * Add explanation for sampling parameters * Initial docs for the enhancer model * Fix use_start_end_token parameter usage --------- Signed-off-by: Vladimir Bataev Signed-off-by: hsiehjackson * fix conversion and eval (#6648) * fix conversion and eval Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Confidence ensembles implementation (#6614) * Working version to train conf model + save ensemble class Signed-off-by: Igor Gitman * Working version Signed-off-by: Igor Gitman * Remove copy of transcribe_speech.py Signed-off-by: Igor Gitman * Move models parameter to config Signed-off-by: Igor Gitman * Add explicit parameters to transcribe Signed-off-by: Igor Gitman * Small cleanups Signed-off-by: Igor Gitman * Add temperature and integration tests Signed-off-by: Igor Gitman * Add more tests Signed-off-by: Igor Gitman * Add pc removal config Signed-off-by: Igor Gitman * Cleanup Signed-off-by: Igor Gitman * Fix typo Signed-off-by: Igor Gitman * Address review comments Signed-off-by: Igor Gitman --------- Signed-off-by: Igor Gitman Signed-off-by: hsiehjackson * Patch memory used for NeMo Megatron models (#6615) * Patch memory used for NeMo Megatron models Signed-off-by: smajumdar * Cleanup the dtype of embeddings Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor util function for parsing precision Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor util function for parsing precision Signed-off-by: smajumdar 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Try patch for Megatron O2 Signed-off-by: smajumdar * Refactor to incorporate megatron amp 02 state Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor to incorporate megatron amp 02 state Signed-off-by: smajumdar * Correct indent Signed-off-by: smajumdar * Correct utils import Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * handle artifacts when path is dir (#6658) Signed-off-by: arendu Signed-off-by: hsiehjackson * remove upgrading setuptools in reinstall.sh (#6659) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: hsiehjackson * merge lora weights into base model (#6597) * merge lora weights into base model Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo fix Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor update Signed-off-by: arendu * update copyright Signed-off-by: arendu * eval needs to know the PEFT class Signed-off-by: arendu * add target class in training script so that we can use it in eval Signed-off-by: arendu * update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update to work for tp1 Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * set restore model path Signed-off-by: arendu * peft can be none Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * updated merge script so that eval works easily Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * eval with peft or sft model Signed-off-by: arendu * keep sentences in jsonl format Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * convert sft using correct classpath Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated to force sft yaml to have the correct target Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated docs Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix conversion and eval Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * upgrade to 23.04 (#6660) Signed-off-by: ericharper Signed-off-by: hsiehjackson * Merge r1.18.0 bugfixes and doc updates to main (#6655) * update branch Signed-off-by: ericharper * Remove from jenkins (#6641) * add megatron_core to requirements Signed-off-by: ericharper * remove from jenkins Signed-off-by: ericharper --------- Signed-off-by: ericharper * remove dup Signed-off-by: ericharper * update branch Signed-off-by: ericharper * [TTS] reformat NeMo versions in the tts logging messages to avoid batch process them when upgrading NeMo versions. 
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --------- Signed-off-by: ericharper Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * Confidence ensembles: fix issues and add tuning functionality (#6657) * Implement compute confidence to properly handle blanks Signed-off-by: Igor Gitman * Implement proper confidence for transducers Signed-off-by: Igor Gitman * Implement tuning logic Signed-off-by: Igor Gitman * Add tests for confidence tuning Signed-off-by: Igor Gitman * Remove unused imports Signed-off-by: Igor Gitman * Add types/docs Signed-off-by: Igor Gitman * Add comment about the main conf compute loop Signed-off-by: Igor Gitman --------- Signed-off-by: Igor Gitman Signed-off-by: hsiehjackson * [TTS] Implement new TextToSpeech dataset (#6575) * [TTS] Implement new TextToSpeech dataset Signed-off-by: Ryan * [TTS] Add unit tests Signed-off-by: Ryan * [TTS] Fix defaulting of use_log_energy Signed-off-by: Ryan * [TTS] Fix TTS export test Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: hsiehjackson * Dialogue dataset (#6654) * chatbot interface Signed-off-by: Yi Dong * latest gradio Signed-off-by: Yi Dong * default greedy Signed-off-by: Yi Dong * better chatbot Signed-off-by: Yi Dong * handle preamble Signed-off-by: Yi Dong * added chatbot training capability Signed-off-by: Yi Dong * added chatbot ui Signed-off-by: Yi Dong * remove debug code Signed-off-by: Yi Dong * default human Signed-off-by: Yi Dong * use special token for roles Signed-off-by: Yi Dong * special tokens Signed-off-by: Yi Dong * fix name Signed-off-by: Yi Dong * new chat dataset Signed-off-by: Yi Dong * fix the system token Signed-off-by: Yi Dong * upgrade gradio Signed-off-by: Yi Dong * save the chat history Signed-off-by: Yi Dong * update ui Signed-off-by: root * update chat interface Signed-off-by: Yi Dong * handles
canonical form Signed-off-by: Yi Dong * new sft chatbot Signed-off-by: Yi Dong * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change format Signed-off-by: Yi Dong * check extra_id in the tokenizer Signed-off-by: Yi Dong * added vocab property check Signed-off-by: Yi Dong * added missing file Signed-off-by: Yi Dong --------- Signed-off-by: Yi Dong Signed-off-by: root Co-authored-by: root Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Signed-off-by: hsiehjackson * Add support for RNNT/hybrid models to partial transcribe (#6609) * Add support for RNNT/hybrid models to partial transcribe Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_utils.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_speech.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_utils.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * eval_beamsearch_ngram.py with hybrid ctc (#6656) * separate_punctuation = false * ctc decoding strategy = model.decoding * transcribe(files, logprobs=True) returns logprobs --------- Signed-off-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * fix bucketing bug issue for picking new bucket (#6663) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com 
hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Add t5 flash-attention Signed-off-by: hsiehjackson * PE refactor (#6673) * PE refactor Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Add singleton alibi Signed-off-by: hsiehjackson * Fix FA mask Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * singleton PE Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix attn bias inference Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * fix eval Signed-off-by: Evelina Signed-off-by: hsiehjackson * [TTS] Add callback for saving audio during FastPitch training (#6665) * [TTS] Add callback for saving audio during FastPitch training Signed-off-by: Ryan * [TTS] Allow NGC model name for vocoder Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: hsiehjackson * update batch size recommendation to min 32 for 43b (#6675) * update batch size recommendation to min 32 for 43b Signed-off-by: Zhilin Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Zhilin Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Make Note usage consistent in adapter_mixins.py (#6678) Inconsistent usage of the word Note, which includes a broken reading in one case. I'm just doing some tidying -- not trying to be critical. 
Signed-off-by: Brian McBrayer Signed-off-by: hsiehjackson * Fix masking bug for TTS Aligner (#6677) Signed-off-by: Jocelyn Huang Signed-off-by: hsiehjackson * [ASR] Adding ssl config for fast-conformer (#6672) * adding ssl config for fast-conformer adding boolean flags for ssl losses Signed-off-by: Krishna Puvvada * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * renaming fast-conformer to fastconformer in config folder Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix xpos offset Signed-off-by: hsiehjackson * Fix sequence parallel Signed-off-by: hsiehjackson * Fix parallel Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Uncomment correct bias size Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Remove unused module Signed-off-by: hsiehjackson * Fix singleton tril Signed-off-by: hsiehjackson * Fix kerple/sandwich rename xpos Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * fix sandwich Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Add unit test Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix bug Signed-off-by: hsiehjackson * [pre-commit.ci] auto
fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Add requirements Signed-off-by: hsiehjackson * Remove requirements Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Remove requirement flash-attn Signed-off-by: hsiehjackson * Fix FA causal for inference Signed-off-by: hsiehjackson * Add experimental PE Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Update all invalid tree references to blobs for NeMo samples (#6679) The tree is invalid as this points to a blob, and the links would not open in colab. Signed-off-by: Brian McBrayer Co-authored-by: Brian McBrayer Signed-off-by: hsiehjackson * Update README.rst about container (#6686) Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix a bug, use _ceil_to_nearest instead as _round_to_nearest is not defined (#6681) (#6682) Co-authored-by: Li Tao Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Signed-off-by: hsiehjackson * Enable ONNX export of 5B GPT trained with TE FP8 modules (#6458) * add GPT FP8 ONNX export support Signed-off-by: Asfiya Baig * changes 1. Add dynamic axes for inputs 2. 
Update model input_example to resolve size error by TE Signed-off-by: Asfiya Baig * Conform to Python style guidelines Signed-off-by: Asfiya Baig * refactor to avoid typecasting bf16 string Signed-off-by: Asfiya Baig * fix attribute error in export_utils Signed-off-by: Asfiya Baig * set constant_folding to False by default Signed-off-by: Asfiya Baig * refactor exportable wrapper into model class definition Signed-off-by: Asfiya Baig * remove conditional replacement of modules Signed-off-by: Asfiya Baig * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * set fp8_recipe to None by default Signed-off-by: Asfiya Baig * address all comments Signed-off-by: Asfiya Baig * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typecast precision check for fp16 Signed-off-by: Asfiya Baig * rename export script Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Boris Fomitchev Signed-off-by: hsiehjackson * [TTS] Add script for text preprocessing (#6541) * [TTS] Add script for text preprocessing Signed-off-by: Ryan * [TTS] Use Normalizer.input_case Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: hsiehjackson * [TTS] Fix adapter duration issue (#6697) * Fix duration issue Signed-off-by: hsiehjackson * Fix duration issue Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add scale aligner loss Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bug Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * karpnv/issues6690 (#6705) * add sudo Signed-off-by: 
Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * RUN Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Limit codeql scope (#6710) Signed-off-by: smajumdar Signed-off-by: hsiehjackson * eval fix (#6685) * allows usage of pre-extracted base model Signed-off-by: arendu * extracted model checking and loading Signed-off-by: arendu * style Signed-off-by: arendu * style Signed-off-by: arendu * update Signed-off-by: arendu * removed sft eval script, can use peft eval script for sft models Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: hsiehjackson * Fix k2 installation in Docker with CUDA 12 (#6707) (#6709) Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev Signed-off-by: hsiehjackson * [TTS] Filter out silent audio files during preprocessing (#6716) Signed-off-by: Ryan Signed-off-by: hsiehjackson * not pinning version (#6680) Signed-off-by: Yi Dong Signed-off-by: hsiehjackson * Tutorial fixes (#6717) (#6718) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: hsiehjackson * preprocess squad in sft format (#6727) * preprocess squad in sft format Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix Codeql (#6731) Signed-off-by: smajumdar Signed-off-by: hsiehjackson * [TTS] fix inconsistent type hints for IpaG2p (#6733) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * VP Fixes for converter + Config management (#6698) * [Temp] VP Fixes Signed-off-by: smajumdar * Revert logging Signed-off-by: smajumdar --------- Signed-off-by: 
smajumdar Signed-off-by: hsiehjackson * Graph RNNT: Grid- and Compose-Transducer. W-Transducer loss (#6168) * add GraphTransducerLossBase abstract class with the interface for Graph-based losses * add RNN-T implementation in GraphRnntLoss with tests * add W-Transducer implementation in GraphWTransducerLoss with tests * add GraphRnntLoss + GraphWTransducerLoss to RNN-T loss resolver --------- Signed-off-by: Vladimir Bataev Signed-off-by: hsiehjackson * Fix fastpitch test nightly (#6730) * fix test fastpitch nightly Signed-off-by: hsiehjackson * Reformat Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix if elif condition Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix for interctc test random failure (#6644) Signed-off-by: Igor Gitman Signed-off-by: hsiehjackson * check for first or last stage (#6708) (#6743) * check for first or last stage * remove redundant check * fix typo * add map_location --------- Signed-off-by: ericharper Co-authored-by: Eric Harper Signed-off-by: hsiehjackson * sharded manifests docs (#6751) Signed-off-by: Dima Rekesh Co-authored-by: Dima Rekesh Signed-off-by: hsiehjackson * [TTS] relax hardcoded prefix for phonemes and tones and infer phoneme set through dict (#6735) * [TTS] relax hardcoded prefix for phonemes and tones and infer phoneme set through dict. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * None checks for prefix. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * [TTS] corrected misleading deprecation warnings. (#6702) * [TTS] corrected misleading deprecation warnings.
* deprecation warning is only triggered when old models applied old g2p. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * Bug fix to restore act ckpt (#6753) (#6755) * Bug fix to restore act ckpt * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Markel Sanz Ausin Co-authored-by: Markel Sanz Ausin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Bug fix to reset sequence parallelism (#6756) (#6770) * Bug fix to reset sequence parallelism * Update seq par reset/restore * Add nested loop --------- Signed-off-by: Markel Sanz Ausin Co-authored-by: Markel Sanz Ausin Signed-off-by: hsiehjackson * Fix TTS adapter tutorial (#6741) * Fix adapter tutorial Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix typos Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix checkpointed forward and add test for full activation checkpointing (#6744) (#6771) * fix checkpointed forward and add test for full activation checkpointing * add method * add method --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Signed-off-by: hsiehjackson * lora notebook (#6765) * lora training Signed-off-by: arendu * update Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: hsiehjackson * Fix Links (#6777) (#6778) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: hsiehjackson * Remove alibi tril Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: 
hsiehjackson * Add flash-attn requirement Signed-off-by: hsiehjackson * revert sft dataset changes Signed-off-by: Evelina Signed-off-by: hsiehjackson * Move flash-attn requirement Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Add install Signed-off-by: hsiehjackson * peft eval directly from ckpt (#6785) * update to load from ckpt Signed-off-by: arendu * update Signed-off-by: arendu * load ckpt peft model Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update style Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Add Frame-VAD examples and utils (#6463) * add model, dataset, necessary utils and tests Signed-off-by: stevehuang52 * fix tarred data Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * add fvad examples and update utils Signed-off-by: stevehuang52 * add copyright Signed-off-by: stevehuang52 * refactor and add tests Signed-off-by: stevehuang52 * update dataset Signed-off-by: stevehuang52 * update test Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * fix typos Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: Taejin Park Signed-off-by: hsiehjackson * [TTS][zh] refine hardcoded lowercase for ASCII letters. 
(#6781) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: hsiehjackson * Revert evaluation Signed-off-by: hsiehjackson * Revert evaluation Signed-off-by: hsiehjackson * Fix Signed-off-by: hsiehjackson * Fix gpu Signed-off-by: hsiehjackson * Spellchecking ASR customization model (#6179) * bug fixes Signed-off-by: Alexandra Antonova * fix bugs, add preparation and evaluation scripts, add readme Signed-off-by: Alexandra Antonova * small fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add real coverage calculation, small fixes, more debug information Signed-off-by: Alexandra Antonova * add option to pass a filelist and output folder - to handle inference from multiple input files Signed-off-by: Alexandra Antonova * added preprocessing for yago wikipedia articles - finding yago entities and their subphrases Signed-off-by: Alexandra Antonova * yago wiki preprocessing, sampling, pseudonormalization Signed-off-by: Alexandra Antonova * more scripts for preparation of training examples Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * add some alphabet checks Signed-off-by: Alexandra Antonova * add bert on subwords, concatenate it to bert on characters Signed-off-by: Alexandra Antonova * add calculation of character_pos_to_subword_pos Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * tensor join bug fix Signed-off-by: Alexandra Antonova * double hidden_size in classifier Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * default index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * pad index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * remove pdb Signed-off-by: Alexandra Antonova * fix 
bugs, add creation of tarred dataset Signed-off-by: Alexandra Antonova * add possibility to change sequence len at inference Signed-off-by: Alexandra Antonova * change sampling of dummy candidates at inference, add candidate info file Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * update transcription now uses info Signed-off-by: Alexandra Antonova * write path Signed-off-by: Alexandra Antonova * 1. add tarred dataset support(untested). 2. fix bug with ban_ngrams in indexing Signed-off-by: Alexandra Antonova * skip short_sent if no real candidates Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * add braceexpand Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug in np.ones Signed-off-by: Alexandra Antonova * fix bug in collate Signed-off-by: Alexandra Antonova * change tensor type to long because of error in torch.gather Signed-off-by: Alexandra Antonova * fix for empty spans tensor Signed-off-by: Alexandra Antonova * same fixes in _collate_fn for tarred dataset Signed-off-by: Alexandra Antonova * fix bug from previous commit Signed-off-by: Alexandra Antonova * change int types to be shorter to minimize tar size Signed-off-by: Alexandra Antonova * refactoring of datasets and inference Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * tar by 100k examples, small fixes Signed-off-by: Alexandra Antonova * small fixes, add analytics script Signed-off-by: Alexandra Antonova * Add functions for dynamic programming comparison to get best path by ngrams Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * small fix Signed-off-by: Alexandra Antonova * fixes to support testing on SPGISpeech 
Signed-off-by: Alexandra Antonova * add preprocessing for userlibri Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * small refactoring before pr. Add bash-scripts reproducing evaluation Signed-off-by: Alexandra Antonova * style fix Signed-off-by: Alexandra Antonova * small fixes in inference Signed-off-by: Alexandra Antonova * bug fix - didn't move window on last symbol Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug - shuffle was before truncation of sorted candidates Signed-off-by: Alexandra Antonova * refactoring, fix some bugs Signed-off-by: Alexandra Antonova * various fixes. Add word_indices at inference Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add candidate positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move data preparation and evaluation to other repo Signed-off-by: Alexandra Antonova * add infer_reproduce_paper.
Refactoring Signed-off-by: Alexandra Antonova * refactor inference using fragment indices Signed-off-by: Alexandra Antonova * add some helper functions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug with parameters order Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bugs Signed-off-by: Alexandra Antonova * refactoring, fix bug Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add multiple variants of adjusting start/end positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add unit tests, other fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Alexandra Antonova * fix CodeQl warnings Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * fix bugs, add preparation and evaluation scripts, add readme Signed-off-by: Alexandra Antonova * small fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add real coverage calculation, small fixes, more debug information Signed-off-by: Alexandra Antonova * add option to pass a filelist and output folder - to handle inference from multiple input files Signed-off-by: Alexandra Antonova * added preprocessing for yago wikipedia articles - finding yago entities and their subphrases Signed-off-by: Alexandra Antonova * yago wiki preprocessing, sampling, pseudonormalization Signed-off-by: 
Alexandra Antonova * more scripts for preparation of training examples Signed-off-by: Alexandra Antonova * bug fixes Signed-off-by: Alexandra Antonova * add some alphabet checks Signed-off-by: Alexandra Antonova * add bert on subwords, concatenate it to bert on characters Signed-off-by: Alexandra Antonova * add calculation of character_pos_to_subword_pos Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * tensor join bug fix Signed-off-by: Alexandra Antonova * double hidden_size in classifier Signed-off-by: Alexandra Antonova * pdb Signed-off-by: Alexandra Antonova * default index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * pad index value 0 instead of -1 because index cannot be negative Signed-off-by: Alexandra Antonova * remove pdb Signed-off-by: Alexandra Antonova * fix bugs, add creation of tarred dataset Signed-off-by: Alexandra Antonova * add possibility to change sequence len at inference Signed-off-by: Alexandra Antonova * change sampling of dummy candidates at inference, add candidate info file Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * update transcription now uses info Signed-off-by: Alexandra Antonova * write path Signed-off-by: Alexandra Antonova * 1. add tarred dataset support(untested). 2. 
fix bug with ban_ngrams in indexing Signed-off-by: Alexandra Antonova * skip short_sent if no real candidates Signed-off-by: Alexandra Antonova * fix import Signed-off-by: Alexandra Antonova * add braceexpand Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * fix bug in np.ones Signed-off-by: Alexandra Antonova * fix bug in collate Signed-off-by: Alexandra Antonova * change tensor type to long because of error in torch.gather Signed-off-by: Alexandra Antonova * fix for empty spans tensor Signed-off-by: Alexandra Antonova * same fixes in _collate_fn for tarred dataset Signed-off-by: Alexandra Antonova * fix bug from previous commit Signed-off-by: Alexandra Antonova * change int types to be shorter to minimize tar size Signed-off-by: Alexandra Antonova * refactoring of datasets and inference Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * bug fix Signed-off-by: Alexandra Antonova * tar by 100k examples, small fixes Signed-off-by: Alexandra Antonova * small fixes, add analytics script Signed-off-by: Alexandra Antonova * Add functions for dynamic programming comparison to get best path by ngrams Signed-off-by: Alexandra Antonova * fixes Signed-off-by: Alexandra Antonova * small fix Signed-off-by: Alexandra Antonova * fixes to support testing on SPGISpeech Signed-off-by: Alexandra Antonova * add preprocessing for userlibri Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * some refactoring Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * move some functions to utils to reuse from other project Signed-off-by: Alexandra Antonova * small refactoring before pr. 
Add bash-scripts reproducing evaluation Signed-off-by: Alexandra Antonova * style fix Signed-off-by: Alexandra Antonova * small fixes in inference Signed-off-by: Alexandra Antonova * bug fix - didn't move window on last symbol Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug - shuffle was before truncation of sorted candidates Signed-off-by: Alexandra Antonova * refactoring, fix some bugs Signed-off-by: Alexandra Antonova * various fixes. Add word_indices at inference Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add candidate positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move data preparation and evaluation to other repo Signed-off-by: Alexandra Antonova * add infer_reproduce_paper. Refactoring Signed-off-by: Alexandra Antonova * refactor inference using fragment indices Signed-off-by: Alexandra Antonova * add some helper functions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug with parameters order Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bugs Signed-off-by: Alexandra Antonova * refactoring, fix bug Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add multiple variants of adjusting start/end positions Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more fixes Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add
unit tests, other fixes Signed-off-by: Alexandra Antonova * fix Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix CodeQl warnings Signed-off-by: Alexandra Antonova * add script for full inference pipeline, refactoring Signed-off-by: Alexandra Antonova * add tutorial Signed-off-by: Alexandra Antonova * take example data from HuggingFace Signed-off-by: Alexandra Antonova * add docs Signed-off-by: Alexandra Antonova * fix comment Signed-off-by: Alexandra Antonova * fix bug Signed-off-by: Alexandra Antonova * small fixes for PR Signed-off-by: Alexandra Antonova * add some more tests Signed-off-by: Alexandra Antonova * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * try to fix tests adding with_downloads Signed-off-by: Alexandra Antonova * skip tests with tokenizer download Signed-off-by: Alexandra Antonova --------- Signed-off-by: Alexandra Antonova Signed-off-by: Alexandra Antonova Co-authored-by: Alexandra Antonova Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hsiehjackson * Fix test Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix device Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Revert Signed-off-by: hsiehjackson * clean Signed-off-by: hsiehjackson * Change device Signed-off-by: hsiehjackson * Change device Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add test FA Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add CI Signed-off-by: hsiehjackson * Fix yaml order 
Signed-off-by: hsiehjackson * Test random attention mask Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add install FA for tests Signed-off-by: hsiehjackson * cherry pick 6788 (#6816) * cherry pick 6788 Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Support 2D mask Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add missing comp_att_mask arg Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix code ql Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Megatron MPT-7B Support (#6804) * Initial commit of MPT-7B functionality Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added various fixes requested by reviewers Signed-off-by: Daniel Egert * Added conversion script for mpt-7b to Nemo Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added small note about TP and PP values Signed-off-by: Daniel Egert * Replaced all print statements with Nemo logging Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix test triton Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FA in CI 
Signed-off-by: hsiehjackson * Fix Jenkin error Signed-off-by: hsiehjackson * Resume with FA Signed-off-by: hsiehjackson * Follow comments Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix README Signed-off-by: hsiehjackson * Fix README Signed-off-by: hsiehjackson * Remove torch.cuda Signed-off-by: hsiehjackson * Remove unused import Signed-off-by: hsiehjackson * kerple init Signed-off-by: hsiehjackson * Add TE comment Signed-off-by: hsiehjackson * Fix error when inference.compute_attention_mask=False Signed-off-by: hsiehjackson --------- Signed-off-by: Abhinav Khattar Signed-off-by: hsiehjackson Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Dima Rekesh Signed-off-by: Jim O’Regan Signed-off-by: smajumdar Signed-off-by: Mostafa Ghorbandoost Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Micha Livne Signed-off-by: Kunal Dhawan Signed-off-by: andrusenkoau Signed-off-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Signed-off-by: Evelina Signed-off-by: Ryan Signed-off-by: Taejin Park Signed-off-by: Ante Jukić Signed-off-by: Daniel Egert Signed-off-by: stevehuang52 Signed-off-by: Vahid Signed-off-by: KunalDhawan Signed-off-by: MaximumEntropy Signed-off-by: fayejf Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: Greg Clark Signed-off-by: arendu Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Signed-off-by: Vladimir Bataev Signed-off-by: Elena Rastorgueva Signed-off-by: Igor Gitman Signed-off-by: Sangkug Lym Signed-off-by: sam1373 Signed-off-by: Boris Fomitchev Signed-off-by: Mikołaj Błaż Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: Tim Moon Signed-off-by: Eric Harper 
Signed-off-by: ericharper Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Signed-off-by: Yi Dong Signed-off-by: root Signed-off-by: Nithin Rao Koluguri Signed-off-by: Zhilin Wang Signed-off-by: Brian McBrayer Signed-off-by: Jocelyn Huang Signed-off-by: Krishna Puvvada Signed-off-by: Asfiya Baig Signed-off-by: Dima Rekesh Signed-off-by: Markel Sanz Ausin Signed-off-by: Alexandra Antonova Signed-off-by: Alexandra Antonova Signed-off-by: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Abhinav Khattar Co-authored-by: Micha Livne Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dima Rekesh Co-authored-by: Jim O’Regan Co-authored-by: Somshubra Majumdar Co-authored-by: Mostafa Ghorbandoost Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Eric Harper Co-authored-by: Micha Livne Co-authored-by: Kunal Dhawan Co-authored-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: George Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: Ryan Langman Co-authored-by: Slyne Deng Co-authored-by: Taejin Park Co-authored-by: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: trias702 <25867060+trias702@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Vahid Noroozi Co-authored-by: Sandeep Subramanian Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: Nikolay Karpov Co-authored-by: Nikolay Karpov Co-authored-by: Hoo 
Chang Shin Co-authored-by: Greg Clark Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Vladimir Bataev Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Igor Gitman Co-authored-by: Samuel Kriman Co-authored-by: Boris Fomitchev Co-authored-by: mikolajblaz Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Adi Renduchintala Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Co-authored-by: root Co-authored-by: Nithin Rao Co-authored-by: Evelina Co-authored-by: Zhilin Wang Co-authored-by: Brian McBrayer Co-authored-by: Jocelyn Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: Brian McBrayer Co-authored-by: Li Tao Co-authored-by: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Co-authored-by: Dima Rekesh Co-authored-by: Markel Sanz Ausin Co-authored-by: bene-ges Co-authored-by: Alexandra Antonova --- Dockerfile | 5 + Jenkinsfile | 346 ++++++++++++++++++ README.rst | 10 + .../conf/megatron_gpt_config.yaml | 5 +- .../conf/megatron_model_base_config.yaml | 3 +- .../conf/megatron_gpt_peft_eval_config.yaml | 3 +- .../language_modeling/megatron/gpt_model.py | 4 + .../language_modeling/megatron_base_model.py | 14 +- .../language_modeling/megatron_gpt_model.py | 10 +- .../megatron_gpt_prompt_learning_model.py | 1 + .../nlp/modules/common/megatron/attention.py | 328 +++++++++++------ .../modules/common/megatron/language_model.py | 95 ++++- .../modules/common/megatron/layer_norm_1p.py | 30 +- .../common/megatron/megatron_decoders.py | 4 + .../common/megatron/megatron_encoders.py | 4 + .../megatron/megatron_transformer_decoder.py | 4 + .../megatron/megatron_transformer_encoder.py | 14 +- .../megatron/position_embedding/__init__.py | 31 ++ .../alibi_relative_position_embedding.py | 50 ++- 
.../kerple_relative_position_embedding.py | 19 +- .../rotary_position_embedding.py} | 3 +- .../sandwich_relative_position_embedding.py | 75 ++++ .../t5_relative_position_embedding.py | 9 +- .../xpos_position_embedding.py | 78 ++++ .../common/megatron/retrieval_transformer.py | 2 +- .../megatron/token_level_encoder_decoder.py | 27 +- .../modules/common/megatron/transformer.py | 43 ++- .../nlp/modules/common/megatron/utils.py | 31 +- .../common/text_generation_strategy.py | 34 +- .../modules/common/text_generation_utils.py | 11 +- .../ngram_lm/create_lexicon_from_arpa.py | 153 ++++---- .../convert_mpt_7b_hf_to_nemo.py | 212 +++++++++++ tests/collections/nlp/test_flash_attention.py | 247 +++++++++++++ .../nlp/test_position_embedding.py | 211 +++++++++++ .../collections/nlp/test_retrieval_module.py | 2 +- .../nlp/test_retrieval_module_inference.py | 2 +- 36 files changed, 1843 insertions(+), 277 deletions(-) create mode 100644 nemo/collections/nlp/modules/common/megatron/position_embedding/__init__.py rename nemo/collections/nlp/modules/common/megatron/{ => position_embedding}/alibi_relative_position_embedding.py (73%) rename nemo/collections/nlp/modules/common/megatron/{ => position_embedding}/kerple_relative_position_embedding.py (81%) rename nemo/collections/nlp/modules/common/megatron/{rotary_pos_embedding.py => position_embedding/rotary_position_embedding.py} (96%) create mode 100644 nemo/collections/nlp/modules/common/megatron/position_embedding/sandwich_relative_position_embedding.py rename nemo/collections/nlp/modules/common/megatron/{ => position_embedding}/t5_relative_position_embedding.py (95%) create mode 100644 nemo/collections/nlp/modules/common/megatron/position_embedding/xpos_position_embedding.py create mode 100644 scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py create mode 100644 tests/collections/nlp/test_flash_attention.py create mode 100644 tests/collections/nlp/test_position_embedding.py diff --git a/Dockerfile b/Dockerfile index 
82d16a561886..7722555357b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -72,6 +72,11 @@ WORKDIR /tmp/nemo COPY requirements . RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done +# install flash attention dependencies +RUN pip install flash-attn +# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 +RUN pip install triton==2.0.0.dev20221202 + # install k2, skip if installation fails COPY scripts /tmp/nemo/scripts/ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh); INSTALL_CODE=$?; \ diff --git a/Jenkinsfile b/Jenkinsfile index d16379cabb8a..d335378173f0 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -65,6 +65,14 @@ pipeline { pip install -e .' } } + + stage('Flash Attention installation') { + steps { + // pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 + sh 'pip install flash-attn && \ + pip install triton==2.0.0.dev20221202' + } + } stage('PyTorch Lightning version') { steps { @@ -3144,6 +3152,88 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + 
model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=1 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + 
model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" + } + } + stage('L2: Megatron GPT with Rope Pretraining and Resume Training TP=2') { when { anyOf { branch 'main' @@ -3229,6 +3319,262 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } } + stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + 
model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + model.use_flash_attention=True" + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=1 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' 
\ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + model.use_flash_attention=True" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" + } + } + stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + 
model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=1 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + sh "rm -rf 
examples/nlp/language_modeling/gpt_index_mappings" + } + } + stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training TP=2') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + 
trainer.limit_val_batches=1 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_granularity='full' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" + } + } stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { when { anyOf { diff --git a/README.rst b/README.rst index b9ba7fce30f3..863b279b2be8 100644 --- a/README.rst +++ b/README.rst @@ -280,6 +280,16 @@ It is highly recommended to use the NVIDIA PyTorch or NeMo container if having i Transformer Engine requires PyTorch to be built with CUDA 11.8. 
+ +Flash Attention +~~~~~~~~~~~~~~~~~~~~ +Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_. + +.. code-block:: bash + + pip install flash-attn + pip install triton==2.0.0.dev20221202 + NeMo Text Processing ~~~~~~~~~~~~~~~~~~~~ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing <https://github.com/NVIDIA/NeMo-text-processing>`_. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index d502f255bd8e..d1132a32349a 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -77,7 +77,7 @@ model: transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] openai_gelu: False # Use OpenAI's GELU instead of the default GeLU normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. - position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope'] + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. attention_type: 'multihead' # Attention type. Options ['multihead'] share_embeddings_and_output_weights: True # Share embedding and output layer weights. 
@@ -167,6 +167,9 @@ model: reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ## Flash Attention + use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True + data: # Path to data must be specified by the user. # Supports List, String and Dictionary diff --git a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml index d3feb97ea9b4..e98ebae6da63 100644 --- a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml @@ -36,4 +36,5 @@ megatron_legacy: False # Whether to use the legacy Megatron model. This affects normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. 
num_moe_experts: 1 # When >1, FFNs are changed to MoE layers moe_frequency: 1 # every Nth ffn layer will be made MoE -moe_dropout: 0.0 # Dropout value for MoE layers \ No newline at end of file +moe_dropout: 0.0 # Dropout value for MoE layers +use_flash_attention: false # Use flash attention in self-attention module \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml index 69dc17f244f5..8c21117969ab 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml @@ -129,4 +129,5 @@ inference: repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: output.txt \ No newline at end of file + outfile_path: output.txt + compute_attention_mask: True \ No newline at end of file diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index e890e6ae4807..b43dc98f2fe7 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -151,6 +151,7 @@ def __init__( gradient_accumulation_fusion=False, persist_layer_norm=False, openai_gelu=False, + megatron_legacy=False, onnx_safe=False, sequence_parallel=False, transformer_engine=False, @@ -163,6 +164,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + use_flash_attention=False, ): super(GPTModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -232,6 +234,7 @@ def 
__init__( persist_layer_norm=persist_layer_norm, openai_gelu=openai_gelu, onnx_safe=onnx_safe, + megatron_legacy=megatron_legacy, sequence_parallel=sequence_parallel, transformer_engine=transformer_engine, fp8=fp8, @@ -243,6 +246,7 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + use_flash_attention=use_flash_attention, ) if self.share_embeddings_and_output_weights: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 2568a14f8dbf..7be679376175 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -25,6 +25,7 @@ from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.nlp_model import NLPModel +from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION from nemo.collections.nlp.modules.common.megatron.clip_grads import ( clip_grad_norm_distributed_optimizer, clip_grad_norm_fp32, @@ -84,6 +85,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): if trainer is None: raise ValueError(f"Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.") + if cfg.get('use_flash_attention', False) and not HAVE_FLASH_ATTENTION: + raise ImportError( + "flash_attn was not found. Please see the installation instructions: https://github.com/HazyResearch/flash-attention." + "If you use flash_attn with triton. Please install triton==2.0.0.dev20221202." 
+ ) + # this prevents base constructor from initializing tokenizer self.tokenizer = None @@ -205,9 +212,10 @@ def _build_tokenizer(self): self.tokenizer = get_nmt_tokenizer( library=self._cfg.tokenizer.library, model_name=self._cfg.tokenizer.type, - tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.model), - vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.vocab_file), - merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.merge_file), + tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.get('model', None)), + vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.get('vocab_file', None)), + merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.get('merge_file', None)), + use_fast=self.cfg.tokenizer.get('use_fast', False), delimiter=self.cfg.tokenizer.get('delimiter', None), legacy=legacy, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8eff896cf9d8..853c637eb3b3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -300,7 +300,7 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" model = GPTModel( - vocab_size=self.padded_vocab_size, + vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), hidden_size=self.cfg.hidden_size, max_position_embeddings=self.cfg.max_position_embeddings, num_layers=self.cfg.num_layers, @@ -357,6 +357,8 @@ def model_provider_func(self, pre_process, post_process): fp8_amax_compute_algo=self.cfg.get('fp8_amax_compute_algo', 'most_recent'), reduce_amax=self.cfg.get('reduce_amax', True), use_emha=self.cfg.get('use_emha', False), + 
use_flash_attention=self.cfg.get('use_flash_attention', False), + megatron_legacy=self.cfg.get('megatron_legacy', False), ) return model @@ -765,7 +767,6 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ if self.get_attention_mask_from_fusion: required_keys.remove('attention_mask') batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()} - # Model forward pass output_tensor = model( batch['tokens'], @@ -822,9 +823,10 @@ def fwd_output_only_func(dataloader_iter, model): inference_max_sequence_len, ) = batch tokens = tokens.cuda() - attention_mask = attention_mask.cuda() position_ids = position_ids.cuda() - attention_mask = attention_mask[0:1] + if attention_mask is not None: + attention_mask = attention_mask.cuda() + attention_mask = attention_mask[0:1] extra_arg['set_inference_key_value_memory'] = set_inference_key_value_memory[0].item() extra_arg['inference_max_sequence_len'] = inference_max_sequence_len[0].item() output_tensor = model(tokens, position_ids, attention_mask, **extra_arg) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 95448e67bd11..81ca1c283ad0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -753,6 +753,7 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] "add_BOS": inference_config["add_BOS"], "all_probs": inference_config["all_probs"], "compute_logprob": inference_config["compute_logprob"], + "compute_attention_mask": inference_config.get("compute_attention_mask", True), } task_ids, processed_inputs = batch diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index 
9c954b5e6313..b0d98e0c2fb1 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -27,8 +27,15 @@ ) from nemo.collections.nlp.modules.common.megatron.fused_softmax import MatchedScaleMaskSoftmax from nemo.collections.nlp.modules.common.megatron.module import MegatronModule -from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import apply_rotary_pos_emb -from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, attention_mask_func +from nemo.collections.nlp.modules.common.megatron.position_embedding import XPOSPositionEmbedding +from nemo.collections.nlp.modules.common.megatron.position_embedding.rotary_position_embedding import ( + apply_rotary_pos_emb, +) +from nemo.collections.nlp.modules.common.megatron.utils import ( + ApexGuardDefaults, + _cast_if_autocast_enabled, + attention_mask_func, +) from nemo.collections.nlp.parts import utils_funcs from nemo.core import adapter_mixins @@ -55,6 +62,20 @@ HAVE_MEGATRON_CORE = False +try: + from flash_attn.bert_padding import pad_input, unpad_input + from flash_attn.flash_attn_interface import flash_attn_unpadded_func + from flash_attn.flash_attn_triton import flash_attn_func + + HAVE_FLASH_ATTENTION = True + +except (ImportError, ModuleNotFoundError): + + HAVE_FLASH_ATTENTION = False + + flash_attn_unpadded_func, flash_attn_func = None, None + unpad_input, pad_input = None, None + """ We use the following notation throughout this file: h: hidden size n: number of attention heads @@ -104,9 +125,9 @@ def __init__( sequence_parallel=False, gradient_accumulation_fusion=False, normalize_attention_scores=True, + use_flash_attention=False, ): super(ParallelAttention, self).__init__() - self.layer_number = max(1, layer_number) self.attention_type = attention_type self.attn_mask_type = attn_mask_type @@ -201,6 +222,8 @@ def __init__( multi_query_attention=multi_query_attention, 
sequence_parallel=sequence_parallel, normalize_attention_scores=normalize_attention_scores, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) # Output. @@ -292,14 +315,14 @@ def custom_forward(*inputs): return hidden_states - def _allocate_memory(self, inference_max_sequence_len, batch_size, dtype): + def _allocate_memory(self, inference_max_sequence_len, batch_size, dtype, device): return torch.empty( inference_max_sequence_len, batch_size, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, dtype=dtype, - device=torch.cuda.current_device(), + device=device, ) def _transpose_last_dim(self, mixed_layer, num_splits, num_splits_first): @@ -357,10 +380,10 @@ def forward( if set_inference_key_value_memory: assert inference_max_sequence_len and inference_max_sequence_len > 0 self.inference_key_memory = self._allocate_memory( - inference_max_sequence_len, hidden_states.size(1), hidden_states.dtype + inference_max_sequence_len, hidden_states.size(1), hidden_states.dtype, hidden_states.device ) self.inference_value_memory = self._allocate_memory( - inference_max_sequence_len, hidden_states.size(1), hidden_states.dtype + inference_max_sequence_len, hidden_states.size(1), hidden_states.dtype, hidden_states.device ) self.inference_current_sequence_len = 0 @@ -469,7 +492,8 @@ def forward( key_layer = self.inference_key_memory[:end, ...] value_layer = self.inference_value_memory[:end, ...] 
# Adjust attention mask - attention_mask = attention_mask[..., start:end, :end] + if attention_mask is not None: + attention_mask = attention_mask[..., start:end, :end] # adjust the key rotary positional embedding if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb @@ -711,6 +735,8 @@ def __init__( sequence_parallel=False, normalize_attention_scores=True, multi_query_attention=False, + position_embedding_type='learned_absolute', + use_flash_attention=False, ): super(CoreAttention, self).__init__() @@ -723,6 +749,7 @@ def __init__( elif int(precision) == 16: self.fp16 = True self.multi_query_attention = multi_query_attention + self.position_embedding_type = position_embedding_type self.apply_query_key_layer_scaling = apply_query_key_layer_scaling self.attention_softmax_in_fp32 = False @@ -772,8 +799,17 @@ def __init__( # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. + self.attention_dropout_p = attention_dropout self.attention_dropout = torch.nn.Dropout(attention_dropout) + if use_flash_attention: + self.attn_fn = self.flash_attention + else: + self.attn_fn = self.torch_attention + + if position_embedding_type.lower() == 'xpos': + self.xpos = XPOSPositionEmbedding(kv_channels) + def forward( self, query_layer, @@ -786,19 +822,43 @@ def forward( relative_position_bias=None, headscale_tensor=None, ): + b, np, sq, sk, hn = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + query_layer.size(3), + ) - # =================================== - # Raw attention scores. [b, np, s, s] - # =================================== + # ================================================== + # Update attention mask for inference. 
[b, np, sq, sk] + # ================================================== + if get_key_value: + with torch.no_grad(): + if layer_past is not None: + attention_mask = attention_mask[..., sq - 1, :sk].unsqueeze(2) + else: + attention_mask = attention_mask[..., :sq, :sk] - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + # ================================================== + # Update attention bias. [b, np, sq, sk] + # ================================================== + if relative_position_bias is not None: + relative_position_bias = relative_position_bias[ + :, + self.num_attention_heads_partition_offset : self.num_attention_heads_partition_offset + + self.num_attention_heads_per_partition, + -sq:, + -sk:, + ] + # ================================================== + # Update query_layer, key_layer, value_layer + # ================================================== # TODO: figure out how to do this # apply relative positional encoding (rotary embedding) if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb - query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb) key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb) # TODO, can apply positional embedding to value_layer so it has @@ -806,86 +866,67 @@ def forward( # otherwise, only relative positional embedding takes effect # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) - if self.multi_query_attention: - # [sq, b, np, hn] -> [b, np * sq, hn] - query_layer = query_layer.permute([1, 2, 0, 3]).reshape( - output_size[0], output_size[1] * output_size[2], -1 - ) + if self.position_embedding_type.lower() == 'xpos': + query_layer = self.xpos(query_layer, offset=key_layer.shape[-2] - query_layer.shape[-2], downscale=False) + key_layer = self.xpos(key_layer, offset=0, downscale=True) - # [sk, b, 1, hn] -> [b, hn, sk] - key_layer = key_layer.squeeze(2).permute(1, 2, 0) + # ================================================== + # 
query_layer [sq, b, np, hn] + # key_layer [sk, b, np, hn] + # value_layer [sk, b, np, hn] + # attention_mask [b, 1, sq, sk] or [b, s] + # relative_position_bias [b, np, sq, sk] + # context_layer [b, np, sq, hn] + # ================================================== + context_layer = self.attn_fn(query_layer, key_layer, value_layer, attention_mask, relative_position_bias) - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], - output_size[2], - output_size[3], - dtype=query_layer.dtype, - device=torch.cuda.current_device(), - ) + if headscale_tensor is not None: + context_layer = context_layer * headscale_tensor - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer, # [b * np, sq, hn] - key_layer, # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - else: - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], - output_size[2], - output_size[3], - dtype=query_layer.dtype, - device=torch.cuda.current_device(), - ) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # Raw attention scores. 
[b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor) if self.normalize_attention_scores else 1.0, - ) + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) + return context_layer - if relative_position_bias is not None: - attention_scores += relative_position_bias[ - :, - self.num_attention_heads_partition_offset : self.num_attention_heads_partition_offset - + self.num_attention_heads_per_partition, - : attention_scores.size(2), - : attention_scores.size(3), - ] + def torch_attention(self, query_layer, key_layer, value_layer, attention_mask, attention_bias): + sq, b, np, hn = query_layer.shape + sk = key_layer.shape[0] - # ================================================== - # Update attention mask for inference. 
[b, np, sq, sk] - # ================================================== + if self.multi_query_attention: + query_layer = rearrange(query_layer, 'sq b np hn -> b (np sq) hn') + key_layer = rearrange(key_layer, 'sk b 1 hn -> b hn sk') + value_layer = rearrange(value_layer, 'sv b np hn -> (b np) sv hn') + else: + query_layer = rearrange(query_layer, 'sq b np hn -> (b np) sq hn') + key_layer = rearrange(key_layer, 'sk b np hn -> (b np) hn sk') + value_layer = rearrange(value_layer, 'sv b np hn -> (b np) sv hn') + + matmul_input_buffer = torch.empty( + query_layer.shape[0], + query_layer.shape[1], + key_layer.shape[2], + dtype=query_layer.dtype, + device=query_layer.device, + ) - if get_key_value: - with torch.no_grad(): - if layer_past is not None: - attention_mask = attention_mask[ - ..., attention_scores.size(3) - 1, : attention_scores.size(3) - ].unsqueeze(2) - else: - attention_mask = attention_mask[..., : attention_scores.size(3), : attention_scores.size(3)] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, + key_layer, + beta=0.0, + alpha=(1.0 / self.norm_factor) if self.normalize_attention_scores else 1.0, + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(b, np, sq, sk) - # =========================== - # Attention probs and dropout - # =========================== + if attention_bias is not None: + attention_scores += attention_bias - # attention scores and attention mask [b, np, sq, sk] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) # This is actually dropping out entire tokens to attend to, which might @@ -897,36 +938,111 @@ def forward( else: attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= + # change view [b * np, sq, sk] + attention_probs = rearrange(attention_probs, 'b np sq sk -> (b np) sq sk') - # value_layer -> context layer. 
- # [sk, b, np, hn] --> [b, np, sq, hn] + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [b, np, sq, hn] + context_layer = rearrange(context_layer, '(b np) sq hn -> b np sq hn', np=np) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + return context_layer - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + def flash_attention(self, query_layer, key_layer, value_layer, attention_mask, attention_bias): + query_layer = rearrange(query_layer, 'sq b np hn -> b sq np hn') + key_layer = rearrange(key_layer, 'sk b np hn -> b sk np hn') + value_layer = rearrange(value_layer, 'sv b np hn -> b sv np hn') - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + # Use to ensure dtype cast to fp16 or bf16 + query_layer = _cast_if_autocast_enabled(query_layer) + key_layer = _cast_if_autocast_enabled(key_layer) + value_layer = _cast_if_autocast_enabled(value_layer) + attention_mask = _cast_if_autocast_enabled(attention_mask) + attention_bias = _cast_if_autocast_enabled(attention_bias) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) + if attention_bias is not None: + return self.flash_attention_triton(query_layer, key_layer, value_layer, attention_mask, attention_bias,) + else: + return self.flash_attention_cuda(query_layer, key_layer, value_layer, attention_mask,) + + def reset_is_causal(self, query_length, key_length, causal): + if query_length != key_length: + if query_length == 1: + return False + raise NotImplementedError( + "Flash attention does not support query and key with different number of tokens, unless number of query tokens is 1." 
+ ) + return causal + + def flash_attention_cuda(self, query_layer, key_layer, value_layer, attention_mask): + batch_size, seqlen, nheads, _ = query_layer.shape + + # True: attend / False: not attend + if attention_mask is None: + attention_mask_q = torch.ones(batch_size, query_layer.shape[1], device=query_layer.device).bool() + attention_mask_kv = torch.ones(batch_size, key_layer.shape[1], device=query_layer.device).bool() + elif len(attention_mask.shape) == 4: + # [b, 1, sq, sk] -> [b, sq] / [b, sk] + attention_mask_q = torch.any(torch.eq(attention_mask, False), dim=3).squeeze(1) + attention_mask_kv = torch.any(torch.eq(attention_mask, False), dim=2).squeeze(1) + else: + assert len(attention_mask.shape) == 2 + attention_mask_q = attention_mask + attention_mask_kv = attention_mask + + q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(query_layer, attention_mask_q) + k, _, cu_seqlens_k, max_seqlen_k = unpad_input(key_layer, attention_mask_kv) + v, _, _, _ = unpad_input(value_layer, attention_mask_kv) + causal = self.reset_is_causal( + query_layer.shape[1], key_layer.shape[1], self.attn_mask_type == AttnMaskType.causal + ) + context_layer = flash_attn_unpadded_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p=self.attention_dropout_p if self.training else 0.0, + causal=causal, + ) - if headscale_tensor is not None: - context_layer = context_layer * headscale_tensor + # [b, sq, np, hn] + context_layer = pad_input(context_layer, indices_q, batch_size, seqlen) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + # [b, sq, np, hn] -> [b, np, sq, hn] + context_layer = context_layer.permute(0, 2, 1, 3) + return context_layer - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) + def flash_attention_triton(self, query_layer, key_layer, 
value_layer, attention_mask, attention_bias): + if self.attention_dropout_p > 0.0: + raise NotImplementedError(f'attention_dropout not implemented for flash_attention with attention bias') + + if attention_mask is not None: + if len(attention_mask.shape) == 4: + # [b, 1, sq, sk] -> [b, 1, sq, 1] / [b, 1, 1, sk] + attention_mask_q = torch.any(torch.eq(attention_mask, False), dim=3).unsqueeze(3) + attention_mask_kv = torch.any(torch.eq(attention_mask, False), dim=2).unsqueeze(2) + else: + # [b, s] -> [b, 1, s, 1] / [b, 1, 1, s] + assert len(attention_mask.shape) == 2 + attention_mask_q = attention_mask.unsqueeze(1).unsqueeze(3) + attention_mask_kv = attention_mask.unsqueeze(1).unsqueeze(2) + + attention_bias = attention_bias.masked_fill(~attention_mask_q, torch.finfo(query_layer.dtype).min) + attention_bias = attention_bias.masked_fill(~attention_mask_kv, torch.finfo(query_layer.dtype).min) + + causal = self.reset_is_causal( + query_layer.shape[1], key_layer.shape[1], self.attn_mask_type == AttnMaskType.causal + ) + context_layer = flash_attn_func(query_layer, key_layer, value_layer, attention_bias, causal) + + # [b, sq, np, hn] -> [b, np, sq, hn] + context_layer = context_layer.permute(0, 2, 1, 3) + + if attention_mask is not None: + context_layer = context_layer * attention_mask_q return context_layer diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index b8b12cf0caec..2d10576dc7d0 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -21,7 +21,12 @@ ) from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.module import MegatronModule -from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding +from nemo.collections.nlp.modules.common.megatron.position_embedding import ( + 
ALiBiRelativePositionEmbedding, + KERPLERelativePositionEmbedding, + RotaryEmbedding, + SandwichRelativePositionEmbedding, +) from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformer from nemo.collections.nlp.modules.common.megatron.utils import ( ApexGuardDefaults, @@ -116,6 +121,7 @@ def get_language_model( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + use_flash_attention=False, ): """Build language model and return along with the key to save.""" @@ -191,6 +197,7 @@ def get_language_model( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + use_flash_attention=use_flash_attention, ) # key used for checkpoints. language_model_key = 'language_model' @@ -497,6 +504,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + use_flash_attention=False, ): super(TransformerLanguageModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -518,7 +526,6 @@ def __init__( self.share_embeddings_and_output_weights = share_embeddings_and_output_weights self.sequence_parallel = sequence_parallel self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2) - if kv_channels is None: assert ( @@ -551,6 +558,40 @@ def __init__( rotary_dim = int(rotary_dim * rotary_percentage) self.rotary_pos_emb = RotaryEmbedding(rotary_dim) + elif position_embedding_type == 'alibi': + # TODO: If this is used for encoder-decodemax_position_embeddingsr model, implement proper logic and following + # addition for decoder. Currently it is only used for decoder model only. 
+ # Encoder-decoder model, such as T5 is implemented in token_level_encoder_decoder.py + self.encoder_relative_position_embedding = ALiBiRelativePositionEmbedding( + bidirectional=encoder_attn_mask_type != AttnMaskType.causal, + num_attention_heads=num_attention_heads, + layer_type=LayerType.encoder, + num_attention_heads_alibi=None, + max_seq_len=max_position_embeddings, + ) + + elif position_embedding_type == 'kerple': + # TODO: If this is used for encoder-decodemax_position_embeddingsr model, implement proper logic and following + # addition for decoder. Currently it is only used for decoder model only. + # Encoder-decoder model, such as T5 is implemented in token_level_encoder_decoder.py + self.encoder_relative_position_embedding = KERPLERelativePositionEmbedding( + bidirectional=encoder_attn_mask_type != AttnMaskType.causal, + num_attention_heads=num_attention_heads, + layer_type=LayerType.encoder, + num_attention_heads_kerple=None, + max_seq_len=max_position_embeddings, + ) + assert use_flash_attention == False # flash-attention not supported with kerple at this point + + elif position_embedding_type == 'sandwich': + self.encoder_relative_position_embedding = SandwichRelativePositionEmbedding( + bidirectional=encoder_attn_mask_type != AttnMaskType.causal, + num_attention_heads=num_attention_heads, + layer_type=LayerType.encoder, + hidden_size=self.hidden_size // num_attention_heads if kv_channels is None else kv_channels, + max_seq_len=max_position_embeddings, + ) + # Transformer. 
self.encoder = ParallelTransformer( init_method=self.init_method, @@ -602,6 +643,8 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) self._encoder_key = 'encoder' @@ -642,6 +685,8 @@ def __init__( activations_checkpoint_granularity=activations_checkpoint_granularity, activations_checkpoint_layers_per_pipeline=activations_checkpoint_layers_per_pipeline, transformer_engine=transformer_engine, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) self._decoder_key = 'decoder' @@ -713,26 +758,35 @@ def forward( pass # enc_attn_mask: [1, 1, s, s] - - if self.position_embedding_type == 'rope': - if inference_max_sequence_len is not None: - rotary_pos_emb = self.rotary_pos_emb(inference_max_sequence_len) - elif self.encoder.input_tensor is not None: - if self.sequence_parallel: - rotary_pos_emb = self.rotary_pos_emb( - self.encoder.input_tensor.size(0) * parallel_state.get_tensor_model_parallel_world_size() - ) - else: - rotary_pos_emb = self.rotary_pos_emb(self.encoder.input_tensor.size(0)) + if inference_max_sequence_len is not None: + enc_seq_length = inference_max_sequence_len + elif self.encoder.input_tensor is not None: + if self.sequence_parallel: + enc_seq_length = ( + self.encoder.input_tensor.size(0) * parallel_state.get_tensor_model_parallel_world_size() + ) else: - if self.sequence_parallel: - rotary_pos_emb = self.rotary_pos_emb( - encoder_input.size(0) * parallel_state.get_tensor_model_parallel_world_size() - ) - else: - rotary_pos_emb = self.rotary_pos_emb(encoder_input.size(0)) + enc_seq_length = self.encoder.input_tensor.size(0) else: - rotary_pos_emb = None + if self.sequence_parallel: + enc_seq_length = encoder_input.size(0) * parallel_state.get_tensor_model_parallel_world_size() + else: + enc_seq_length = encoder_input.size(0) + + rotary_pos_emb = None + 
encoder_self_attention_relative_position_bias = None + if self.position_embedding_type == 'rope': + rotary_pos_emb = self.rotary_pos_emb(enc_seq_length) + elif ( + self.position_embedding_type == 'alibi' + or self.position_embedding_type == 'sandwich' + or self.position_embedding_type == 'kerple' + ): + encoder_self_attention_relative_position_bias = self.encoder_relative_position_embedding( + query_seq_length=enc_seq_length, key_seq_length=enc_seq_length, + ) + # causal attention bias: [1, head, 1, k] + # non-causal attention bias: [1, head, q, k] # encoder. if enc_hidden_states is None: @@ -747,6 +801,7 @@ def forward( rotary_pos_emb=(rotary_pos_emb, None, None) if rotary_pos_emb is not None else None, # This assumes that this being used as a GPT/BERT model only (no cross-attention) + self_attention_relative_position_bias=encoder_self_attention_relative_position_bias, ) else: encoder_output = enc_hidden_states.to(encoder_input.dtype) diff --git a/nemo/collections/nlp/modules/common/megatron/layer_norm_1p.py b/nemo/collections/nlp/modules/common/megatron/layer_norm_1p.py index ca59bcc8850a..4a94b37aae7b 100644 --- a/nemo/collections/nlp/modules/common/megatron/layer_norm_1p.py +++ b/nemo/collections/nlp/modules/common/megatron/layer_norm_1p.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from torch import nn +import torch +from nemo.collections.nlp.modules.common.megatron.utils import _cast_if_autocast_enabled try: from apex.contrib.layer_norm.layer_norm import FastLayerNorm as OrigFastLayerNorm @@ -35,8 +36,8 @@ def __init__(self, *args, **kwargs): ), 'LayerNorm1P implemented only as an apex.contrib.layer_norm.FastLayerNorm extension' def reset_parameters(self): - nn.init.zeros_(self.weight) - nn.init.zeros_(self.bias) + torch.nn.init.zeros_(self.weight) + torch.nn.init.zeros_(self.bias) def forward(self, x): return _fast_layer_norm(x, self.weight + 1, self.bias, self.epsilon) @@ -44,6 +45,27 @@ def forward(self, x): else: - class LayerNorm1P(nn.Module): + class LayerNorm1P(torch.nn.Module): def __init__(self, *args, **kwargs): raise NotImplementedError('LayerNorm1P available only with apex installed') + + +class LPLayerNorm(torch.nn.LayerNorm): + def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): + super().__init__( + normalized_shape=normalized_shape, + eps=eps, + elementwise_affine=elementwise_affine, + device=device, + dtype=dtype, + ) + + def forward(self, x): + module_device = x.device + downcast_x = _cast_if_autocast_enabled(x) + downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight + downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias + with torch.autocast(enabled=False, device_type=module_device.type): + return torch.nn.functional.layer_norm( + downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps + ) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py index 28eb39e630fc..ca2000842fe4 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py @@ -88,6 +88,8 @@ def get_decoder_model( moe_dropout=0.0, 
turn_off_rop=False, # turn off the RoP positional embedding version=1, + position_embedding_type='learned_absolute', + use_flash_attention=False, ): """Build language model and return along with the key to save.""" @@ -145,6 +147,8 @@ def get_decoder_model( num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) elif arch == "retro": decoder = MegatronRetrievalTransformerDecoderModule( diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py index 4005ffbd879e..9f5d917e2077 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py @@ -90,6 +90,8 @@ def get_encoder_model( moe_dropout=0.0, turn_off_rop=False, # turn off the RoP positional embedding version=1, # model version + position_embedding_type='learned_absolute', + use_flash_attention=False, ): """Build language model and return along with the key to save.""" @@ -147,6 +149,8 @@ def get_encoder_model( num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) elif arch == "retro": encoder = MegatronRetrievalTransformerEncoderModule( diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py index c3cb1fd05c3b..f2c42597eb83 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py @@ -85,6 +85,8 @@ def __init__( num_moe_experts=1, moe_frequency=1, moe_dropout=0.0, + position_embedding_type='learned_absolute', + use_flash_attention=False, ): 
super(MegatronTransformerDecoderModule, self).__init__() @@ -149,6 +151,8 @@ def __init__( num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) self._model_key = 'model' diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py index 2eacf8aad672..60c347338105 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py @@ -82,6 +82,8 @@ def __init__( num_moe_experts=1, moe_frequency=1, moe_dropout=0.0, + position_embedding_type='learned_absolute', + use_flash_attention=False, ): super(MegatronTransformerEncoderModule, self).__init__() @@ -96,6 +98,7 @@ def __init__( self.parent_model_type = parent_model_type self.normalization = normalization self.transformer_block_type = transformer_block_type + self.use_flash_attention = use_flash_attention if kv_channels is None: @@ -147,6 +150,8 @@ def __init__( num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + position_embedding_type=position_embedding_type, + use_flash_attention=use_flash_attention, ) self._model_key = 'model' @@ -163,9 +168,12 @@ def forward( enc_self_attention_relative_position_bias=None, ): # convert to Megatron mask - enc_attn_mask_3d = build_attention_mask_3d( - source_mask=enc_attn_mask, target_mask=enc_attn_mask, attn_mask_type=self.model_attn_mask_type, - ) + if self.use_flash_attention: + enc_attn_mask_3d = enc_attn_mask < 0.5 + else: + enc_attn_mask_3d = build_attention_mask_3d( + source_mask=enc_attn_mask, target_mask=enc_attn_mask, attn_mask_type=self.model_attn_mask_type, + ) # transformer encoder enc_output = self.model( diff --git a/nemo/collections/nlp/modules/common/megatron/position_embedding/__init__.py 
b/nemo/collections/nlp/modules/common/megatron/position_embedding/__init__.py new file mode 100644 index 000000000000..fdbbed86cb2c --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/__init__.py @@ -0,0 +1,31 @@ +# coding=utf-8 +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.nlp.modules.common.megatron.position_embedding.alibi_relative_position_embedding import ( + ALiBiRelativePositionEmbedding, +) +from nemo.collections.nlp.modules.common.megatron.position_embedding.kerple_relative_position_embedding import ( + KERPLERelativePositionEmbedding, +) +from nemo.collections.nlp.modules.common.megatron.position_embedding.rotary_position_embedding import RotaryEmbedding +from nemo.collections.nlp.modules.common.megatron.position_embedding.sandwich_relative_position_embedding import ( + SandwichRelativePositionEmbedding, +) +from nemo.collections.nlp.modules.common.megatron.position_embedding.t5_relative_position_embedding import ( + T5RelativePositionEmbedding, +) +from nemo.collections.nlp.modules.common.megatron.position_embedding.xpos_position_embedding import ( + XPOSPositionEmbedding, +) diff --git a/nemo/collections/nlp/modules/common/megatron/alibi_relative_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/alibi_relative_position_embedding.py similarity index 73% rename from 
nemo/collections/nlp/modules/common/megatron/alibi_relative_position_embedding.py rename to nemo/collections/nlp/modules/common/megatron/position_embedding/alibi_relative_position_embedding.py index 4f5abd96743b..6425e288f277 100644 --- a/nemo/collections/nlp/modules/common/megatron/alibi_relative_position_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/alibi_relative_position_embedding.py @@ -42,20 +42,31 @@ def build_slopes(num_attention_heads, num_attention_heads_alibi): """ Builds a slopes tensor. """ - slopes = torch.Tensor( - get_slopes(num_attention_heads_alibi) + [0] * (num_attention_heads - num_attention_heads_alibi) - ).cuda() - return slopes.unsqueeze(-1).unsqueeze(-1) + slopes = ( + torch.Tensor(get_slopes(num_attention_heads_alibi) + [0] * (num_attention_heads - num_attention_heads_alibi)) + .unsqueeze(-1) + .unsqueeze(-1) + ) + if torch.cuda.is_available(): + slopes = slopes.to(torch.cuda.current_device()) -def build_relative_position(query_length, key_length, num_attention_heads): - context_position = torch.arange(query_length)[:, None].cuda() - memory_position = torch.arange(key_length)[None, :].cuda() - # shape (query_length, key_length, num_heads) - relative_position = memory_position - context_position + return slopes + + +def build_relative_position(max_seq_len, full=True): + """ + full=True: shape (max_seq_len, max_seq_len) + full=False: shape (max_seq_len) + """ + relative_position = torch.arange(1 - max_seq_len, 1)[None, :].mul(-1) # (1, max_seq_len) + + if full: + memory_position = torch.arange(1 - max_seq_len, 1)[:, None].mul(-1) + relative_position = torch.abs(memory_position - relative_position) # (max_seq_len, max_seq_len) - # shape (num_attention_heads, max_seq_len, max_seq_len) - relative_position = torch.abs(relative_position).unsqueeze(0).expand(num_attention_heads, -1, -1) + if torch.cuda.is_available(): + relative_position = relative_position.to(torch.cuda.current_device()) return 
relative_position @@ -68,7 +79,7 @@ class ALiBiRelativePositionEmbedding(torch.nn.Module): """ def __init__( - self, bidirectional, num_attention_heads, layer_type, num_attention_heads_alibi=None, max_seq_len=512 + self, bidirectional, num_attention_heads, layer_type, num_attention_heads_alibi=None, max_seq_len=512, ): """ Args: @@ -101,20 +112,25 @@ def __init__( # cache the slopes self.slopes = build_slopes(num_attention_heads, num_attention_heads_alibi) # cache the relative position bias. shape (num_attention_heads, max_seq_len, max_seq_len) - self.relative_position = build_relative_position(max_seq_len, max_seq_len, num_attention_heads) + # if we use causal attention (not bidrectional), we can use singleton relative position + self.relative_position = ( + build_relative_position(max_seq_len, full=bidirectional).unsqueeze(0).expand(num_attention_heads, -1, -1) + ) def forward(self, query_seq_length, key_seq_length): # used cached relative position if possible max_seq_len = max(query_seq_length, key_seq_length) if max_seq_len > self.max_seq_len: - relative_position = build_relative_position(max_seq_len, max_seq_len, self.num_attention_heads) + relative_position = ( + build_relative_position(max_seq_len, full=self.bidirectional) + .unsqueeze(0) + .expand(self.num_attention_heads, -1, -1) + ) else: relative_position = self.relative_position # shape (num_attention_heads, query_seq_length, key_seq_length) - relative_position = relative_position[:, :query_seq_length, :key_seq_length] + relative_position = relative_position[:, -query_seq_length:, -key_seq_length:] # if not bidirectional, mask out the future positions - if not self.bidirectional: - relative_position = torch.tril(relative_position) # shape (1, num_heads, query_length, key_length) return -relative_position.unsqueeze(0) * self.slopes diff --git a/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py 
b/nemo/collections/nlp/modules/common/megatron/position_embedding/kerple_relative_position_embedding.py similarity index 81% rename from nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py rename to nemo/collections/nlp/modules/common/megatron/position_embedding/kerple_relative_position_embedding.py index 54276d6fa21e..fc0c837da556 100644 --- a/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/kerple_relative_position_embedding.py @@ -17,7 +17,7 @@ import torch -from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import ( +from nemo.collections.nlp.modules.common.megatron.position_embedding.alibi_relative_position_embedding import ( build_relative_position, build_slopes, ) @@ -33,7 +33,7 @@ class KERPLERelativePositionEmbedding(torch.nn.Module): """ def __init__( - self, bidirectional, num_attention_heads, layer_type, num_attention_heads_kerple=None, max_seq_len=512 + self, bidirectional, num_attention_heads, layer_type, num_attention_heads_kerple=None, max_seq_len=512, ): """ Args: @@ -65,21 +65,26 @@ def __init__( # initialize the slopes self.kerple_b = torch.nn.Parameter(build_slopes(num_attention_heads, num_attention_heads_kerple)) - self.kerple_a = torch.zeros_like(self.kerple_b) - self.kerple_p = torch.ones_like(self.kerple_b) + self.kerple_a = torch.nn.Parameter(torch.ones_like(self.kerple_b)) + self.kerple_p = torch.nn.Parameter(torch.ones_like(self.kerple_b)) # cache the relative position bias. 
shape (num_attention_heads, max_seq_len, max_seq_len) - self.relative_position = build_relative_position(max_seq_len, max_seq_len, num_attention_heads) + # if we use causal attention (not bidrectional), we can use singleton relative position + self.relative_position = ( + build_relative_position(max_seq_len, full=True).unsqueeze(0).expand(num_attention_heads, -1, -1) + ) def forward(self, query_seq_length, key_seq_length): # used cached relative position if possible max_seq_len = max(query_seq_length, key_seq_length) if max_seq_len > self.max_seq_len: - relative_position = build_relative_position(max_seq_len, max_seq_len, self.num_attention_heads) + relative_position = ( + build_relative_position(max_seq_len, full=True).unsqueeze(0).expand(self.num_attention_heads, -1, -1) + ) else: relative_position = self.relative_position # shape (num_attention_heads, query_seq_length, key_seq_length) - relative_position = relative_position[:, :query_seq_length, :key_seq_length] + relative_position = relative_position[:, -query_seq_length:, -key_seq_length:] # if not bidirectional, mask out the future positions if not self.bidirectional: relative_position = torch.tril(relative_position) diff --git a/nemo/collections/nlp/modules/common/megatron/rotary_pos_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py similarity index 96% rename from nemo/collections/nlp/modules/common/megatron/rotary_pos_embedding.py rename to nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py index 191601054ef8..5a8d6d7dd333 100644 --- a/nemo/collections/nlp/modules/common/megatron/rotary_pos_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py @@ -38,7 +38,8 @@ def forward(self, max_seq_len, offset=0): def _rotate_half(x): """ - change sign so the last dimension becomes [-odd, +even] + change sign so the last dimension + [A, B, C, D] -> [-C, -D, A, B] 
""" x = rearrange(x, '... (j d) -> ... j d', j=2) x1, x2 = x.unbind(dim=-2) diff --git a/nemo/collections/nlp/modules/common/megatron/position_embedding/sandwich_relative_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/sandwich_relative_position_embedding.py new file mode 100644 index 000000000000..0e2dfd7d2ef6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/sandwich_relative_position_embedding.py @@ -0,0 +1,75 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +from nemo.collections.nlp.modules.common.megatron.position_embedding.alibi_relative_position_embedding import ( + build_relative_position, +) +from nemo.utils.decorators import experimental + +__all__ = ['SandwichRelativePositionEmbedding'] + + +@experimental +class SandwichRelativePositionEmbedding(torch.nn.Module): + """ + Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis + Based on https://arxiv.org/abs/2212.10356 + """ + + def __init__( + self, bidirectional, num_attention_heads, layer_type, hidden_size, max_seq_len=512, + ): + """ + Args: + num_attention_heads: Number of attention heads + hidden_size: Hidden size per attention head + """ + super().__init__() + self.bidirectional = bidirectional + self.layer_type = layer_type + self.num_attention_heads = num_attention_heads + self.hidden_size = hidden_size + self.max_seq_len = max_seq_len + self.relative_position = build_relative_position(max_seq_len, full=True) + + def forward(self, query_seq_length, key_seq_length): + # used cached relative position if possible + max_seq_len = max(query_seq_length, key_seq_length) + if max_seq_len > self.max_seq_len: + relative_position = build_relative_position(max_seq_len, full=True) + else: + relative_position = self.relative_position + + # shape (query_seq_length, key_seq_length) + relative_position = relative_position[-query_seq_length:, -key_seq_length:] + # if not bidirectional, mask out the future positions + if not self.bidirectional: + relative_position = torch.tril(relative_position) + + inv_freq = 1.0 / ( + 10000 + ** (2 * torch.arange(1, self.hidden_size / 2 + 1, device=relative_position.device) / self.hidden_size) + ) + + _bias = torch.sum((relative_position[:, :, None].repeat(1, 1, len(inv_freq)) * inv_freq).cos(), axis=2) + bias = _bias.repeat(self.num_attention_heads, 1, 1) + + _bias_scales = torch.arange(1, self.num_attention_heads + 1, 1, device=relative_position.device) + bias_scales = _bias_scales[:, 
None, None] + + scaled_bias = (bias - self.hidden_size / 2) / (bias_scales * 8 / self.num_attention_heads).unsqueeze(0) + + return scaled_bias diff --git a/nemo/collections/nlp/modules/common/megatron/t5_relative_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/t5_relative_position_embedding.py similarity index 95% rename from nemo/collections/nlp/modules/common/megatron/t5_relative_position_embedding.py rename to nemo/collections/nlp/modules/common/megatron/position_embedding/t5_relative_position_embedding.py index c2a0c8661acf..4566d9aa7876 100644 --- a/nemo/collections/nlp/modules/common/megatron/t5_relative_position_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/t5_relative_position_embedding.py @@ -43,9 +43,7 @@ def __init__( # Relative position Embedding # Relative Position embedding (all attention layers). - self.relative_position_embedding = torch.nn.Embedding( - self.relative_position_num_buckets, num_attention_heads - ).to(torch.cuda.current_device()) + self.relative_position_embedding = torch.nn.Embedding(self.relative_position_num_buckets, num_attention_heads) self._relative_position_embedding_key = 'relative_position_embedding' init_method(self.relative_position_embedding.weight) @@ -104,8 +102,9 @@ def _compute_relative_position_bucket(self, query_length, key_length): """ """Compute binned relative position bias""" - context_position = torch.arange(query_length, dtype=torch.long, device=torch.cuda.current_device())[:, None] - memory_position = torch.arange(key_length, dtype=torch.long, device=torch.cuda.current_device())[None, :] + device = self.relative_position_embedding.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) 
relative_position_bucket_tensor = self._relative_position_bucket( diff --git a/nemo/collections/nlp/modules/common/megatron/position_embedding/xpos_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/xpos_position_embedding.py new file mode 100644 index 000000000000..ef59234790c5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/xpos_position_embedding.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +from einops import rearrange +from nemo.utils.decorators import experimental + + +def fixed_pos_embedding(x): + seq_len, dim = x.shape + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim) / dim)) + sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(0, seq_len, dtype=torch.float), inv_freq).to(x) + return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp) + + +def rotate_every_two(x): + x1 = x[:, :, ::2] + x2 = x[:, :, 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + + +def duplicate_interleave(m): + """ + A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy. 
+ """ + dim0 = m.shape[0] + m = m.view(-1, 1) # flatten the matrix + m = m.repeat(1, 2) # repeat all elements into the 2nd dimension + m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy + return m + + +def apply_rotary_pos_emb(x, sin, cos, scale=1): + sin, cos = map(lambda t: duplicate_interleave(t * scale), (sin, cos)) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +@experimental +class XPOSPositionEmbedding(nn.Module): + def __init__(self, head_dim, scale_base=2048): + super().__init__() + self.head_dim = head_dim + self.scale_base = scale_base + self.register_buffer("scale", (torch.arange(0, head_dim, 2) + 0.4 * head_dim) / (1.4 * head_dim)) + + def forward(self, x, offset=0, downscale=False): + length, b = x.shape[0], x.shape[1] + x = rearrange(x, 's b np hn -> (b np) s hn') + min_pos = -(length + offset) // 2 + max_pos = length + offset + min_pos + scale = self.scale ** torch.arange(min_pos, max_pos, 1).to(self.scale).div(self.scale_base)[:, None] + sin, cos = fixed_pos_embedding(scale) + + if scale.shape[0] > length: + scale = scale[-length:] + sin = sin[-length:] + cos = cos[-length:] + + if downscale: + scale = 1 / scale + + x = apply_rotary_pos_emb(x, sin, cos, scale) + x = rearrange(x, '(b np) s hn -> s b np hn', b=b) + return x diff --git a/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py b/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py index 73c41cee6c6f..83dea362c3e1 100644 --- a/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py @@ -19,7 +19,7 @@ from einops import rearrange, repeat from nemo.collections.nlp.modules.common.megatron.module import MegatronModule -from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding +from 
nemo.collections.nlp.modules.common.megatron.position_embedding import RotaryEmbedding from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformer from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, build_attention_mask_3d diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py index 229a9af48048..fc16295020fb 100644 --- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py @@ -15,12 +15,6 @@ import torch from omegaconf import DictConfig -from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import ( - ALiBiRelativePositionEmbedding, -) -from nemo.collections.nlp.modules.common.megatron.kerple_relative_position_embedding import ( - KERPLERelativePositionEmbedding, -) from nemo.collections.nlp.modules.common.megatron.language_model import Embedding from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_decoders import get_decoder_model @@ -29,7 +23,11 @@ ) from nemo.collections.nlp.modules.common.megatron.megatron_encoders import get_encoder_model from nemo.collections.nlp.modules.common.megatron.module import MegatronModule -from nemo.collections.nlp.modules.common.megatron.t5_relative_position_embedding import T5RelativePositionEmbedding +from nemo.collections.nlp.modules.common.megatron.position_embedding import ( + ALiBiRelativePositionEmbedding, + KERPLERelativePositionEmbedding, + T5RelativePositionEmbedding, +) from nemo.collections.nlp.modules.common.megatron.utils import ( ApexGuardDefaults, build_position_ids, @@ -197,6 +195,11 @@ def __init__( else: self.encoder_relative_position_embedding = None + if encoder_cfg.get('use_flash_attention', False) and encoder_cfg.get( + 
'position_embedding_type', 'learned_absolute' + ) in ['relative', 'kerple']: + raise ValueError('flash-attention not supported with relative or kerple at this point') + encoder = get_encoder_model( arch=encoder_cfg.arch, hidden_size=encoder_cfg.hidden_size, @@ -243,6 +246,8 @@ def __init__( num_moe_experts=encoder_cfg.get('num_moe_experts', 1), moe_frequency=encoder_cfg.get('moe_frequency', 1), moe_dropout=encoder_cfg.get('moe_dropout', 0.0), + position_embedding_type=encoder_cfg.get('position_embedding_type', 'learned_absolute'), + use_flash_attention=encoder_cfg.get('use_flash_attention', False), ) if add_decoder: @@ -307,6 +312,7 @@ def __init__( ): self.decoder_cross_attention_relative_position_embeddings_weight().data.fill_(0) self.decoder_cross_attention_relative_position_embeddings_weight().shared = True + elif self.decoder_cfg.get('position_embedding_type', 'learned_absolute') == 'alibi': self.decoder_relative_position_embedding = ALiBiRelativePositionEmbedding( bidirectional=False, @@ -328,6 +334,11 @@ def __init__( else: self.decoder_relative_position_embedding = None + if decoder_cfg.get('use_flash_attention', False) and decoder_cfg.get( + 'position_embedding_type', 'learned_absolute' + ) in ['relative', 'kerple']: + raise ValueError('flash-attention not supported with relative or kerple at this point') + decoder = get_decoder_model( arch=decoder_cfg.arch, hidden_size=decoder_cfg.hidden_size, @@ -373,6 +384,8 @@ def __init__( num_moe_experts=decoder_cfg.get('num_moe_experts', 1), moe_frequency=decoder_cfg.get('moe_frequency', 1), moe_dropout=decoder_cfg.get('moe_dropout', 0.0), + position_embedding_type=decoder_cfg.get('position_embedding_type', 'learned_absolute'), + use_flash_attention=decoder_cfg.get('use_flash_attention', False), ) self.enc_dec_model = MegatronTransformerEncoderDecoderModule( diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 
f5dfbcabcd0e..8a0b22b4d289 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -18,6 +18,7 @@ from typing import Any, Callable, Optional import torch +import torch.nn as nn from einops import rearrange from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig @@ -33,7 +34,7 @@ dropout_add, ) from nemo.collections.nlp.modules.common.megatron.fused_layer_norm import get_layer_norm -from nemo.collections.nlp.modules.common.megatron.layer_norm_1p import LayerNorm1P +from nemo.collections.nlp.modules.common.megatron.layer_norm_1p import LayerNorm1P, LPLayerNorm from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.mlp import ParallelMLP, SwitchMLP from nemo.collections.nlp.modules.common.megatron.module import MegatronModule @@ -115,6 +116,12 @@ def _dropout_add(x, bias, residual, prob): return _dropout_add +def remove_bias_from_layernorm(layer): + for module in layer.modules(): + if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter): + module.register_parameter('bias', None) + + class ParallelTransformerLayer_(MegatronModule, adapter_mixins.AdapterModuleMixin): """A single transformer layer. @@ -164,6 +171,7 @@ def __init__( num_moe_experts=1, moe_frequency=1, moe_dropout=0.0, + use_flash_attention=False, ): super(ParallelTransformerLayer_, self).__init__() @@ -187,7 +195,9 @@ def __init__( 'bias_dropout_add_fusion=True requires bias=True, found bias=False. Either set both to True or both to False.' 
) - if normalization not in ['layernorm', 'layernorm1p', 'rmsnorm']: + # the low_precision_layernorm does not require a bias term, whereas layernorm1p from apex + # does require a bias, so it cannot be used for bias-less low precision LN such as in MPT-7B + if normalization not in ['layernorm', 'layernorm1p', 'rmsnorm', 'low_precision_layernorm']: raise ValueError(f'normalization must be "layernorm", "layernorm1p" or "rmsnorm", found {normalization}') if transformer_block_type not in ['pre_ln', 'post_ln', 'normformer']: @@ -212,8 +222,16 @@ def __init__( self.input_layernorm = LayerNorm1P( hidden_size, layernorm_epsilon, sequence_parallel_enabled=sequence_parallel ) + elif normalization == 'low_precision_layernorm': + self.input_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon) else: self.input_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon) + # for architectures such as MPT, there is no bias term even on the layernorms + # this code allows us to remove the bias terms from the layernorm module + # so that we can support MPT. 
However, certain apex-based LNs don't support + # removing bias, so we also have to check for that + if not bias and normalization not in ['layernorm', 'layernorm1p']: + remove_bias_from_layernorm(self.input_layernorm) self.self_attention = ParallelAttention( init_method=init_method, @@ -240,6 +258,7 @@ def __init__( sequence_parallel=sequence_parallel, gradient_accumulation_fusion=gradient_accumulation_fusion, normalize_attention_scores=normalize_attention_scores, + use_flash_attention=use_flash_attention, ) if transformer_block_type == 'normformer': @@ -261,8 +280,12 @@ def __init__( self.post_attention_layernorm = LayerNorm1P( hidden_size, layernorm_epsilon, sequence_parallel_enabled=sequence_parallel ) + elif normalization == 'low_precision_layernorm': + self.post_attention_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon) else: self.post_attention_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon) + if not bias and normalization not in ['layernorm', 'layernorm1p']: + remove_bias_from_layernorm(self.post_attention_layernorm) if self.layer_type == LayerType.decoder_pre_mlp: # skip MLP and cross attention @@ -280,8 +303,12 @@ def __init__( self.post_attention_layernorm = LayerNorm1P( hidden_size, layernorm_epsilon, sequence_parallel_enabled=sequence_parallel ) + elif normalization == 'low_precision_layernorm': + self.post_attention_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon) else: self.post_attention_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon) + if not bias and normalization not in ['layernorm', 'layernorm1p']: + remove_bias_from_layernorm(self.post_attention_layernorm) if self.layer_type == LayerType.decoder or self.layer_type == LayerType.retrieval_encoder: self.inter_attention = ParallelAttention( @@ -669,6 +696,7 @@ def __init__( num_moe_experts=1, moe_frequency=1, moe_dropout=0.0, + use_flash_attention=False, ): super(ParallelTransformerLayer, self).__init__( init_method=init_method, @@ -711,6 +739,7 @@ def 
__init__( num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + use_flash_attention=use_flash_attention, ) # Dtype for forward pass - ignore amp O2 @@ -924,6 +953,7 @@ def __init__( num_moe_experts=1, moe_frequency=1, moe_dropout=0.0, + use_flash_attention=False, ): super(ParallelTransformer, self).__init__() @@ -1104,6 +1134,7 @@ def build_layer(layer_number): num_moe_experts=num_moe_experts, moe_frequency=moe_frequency, moe_dropout=moe_dropout, + use_flash_attention=use_flash_attention, ) if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: @@ -1154,8 +1185,16 @@ def build_layer(layer_number): self.final_layernorm = LayerNorm1P( hidden_size, layernorm_epsilon, sequence_parallel_enabled=sequence_parallel ) + elif normalization == 'low_precision_layernorm': + self.final_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon) else: self.final_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon) + # for architectures such as MPT, there is no bias term even on the layernorms + # this code allows us to remove the bias terms from the layernorm module + # so that we can support MPT. 
However, certain apex-based LNs don't support + # removing bias, so we also have to check for that + if not bias and normalization not in ['layernorm', 'layernorm1p']: + remove_bias_from_layernorm(self.final_layernorm) def _get_layer(self, layer_number): return self.layers[layer_number] diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 8ef46c10d49b..7c7a428fa43f 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -179,7 +179,9 @@ def average_losses_across_data_parallel_group(losses): return averaged_losses -def get_ltor_masks_and_position_ids(data, eod_token, reset_position_ids, reset_attention_mask, eod_mask_loss): +def get_ltor_masks_and_position_ids( + data, eod_token, reset_position_ids, reset_attention_mask, eod_mask_loss, compute_attention_mask=True +): """Build masks and position id for left to right model.""" # Extract batch size and sequence length. @@ -190,9 +192,12 @@ def get_ltor_masks_and_position_ids(data, eod_token, reset_position_ids, reset_a att_mask_batch = micro_batch_size else: att_mask_batch = 1 - attention_mask = torch.tril(torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)).view( - att_mask_batch, 1, seq_length, seq_length - ) + + attention_mask = None + if compute_attention_mask: + attention_mask = torch.tril(torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length + ) # Loss mask. 
loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) @@ -228,8 +233,9 @@ def get_ltor_masks_and_position_ids(data, eod_token, reset_position_ids, reset_a position_ids[b, (i + 1) :] -= i + 1 - prev_index prev_index = i + 1 - # Convert attention mask to binary: - attention_mask = attention_mask < 0.5 + if compute_attention_mask: + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 return attention_mask, loss_mask, position_ids @@ -381,3 +387,16 @@ def get_iterator_k_split(batch: List[torch.Tensor], num_microbatches: int) -> It microbatches = [[elem[i] for elem in split_batch] for i in range(num_microbatches)] return itertools.chain(microbatches) + + +def _cast_if_autocast_enabled(tensor): + if torch.is_autocast_enabled(): + if isinstance(tensor, torch.Tensor): + if tensor.device.type == 'cuda': + dtype = torch.get_autocast_gpu_dtype() + elif tensor.device.type == 'cpu': + dtype = torch.get_autocast_cpu_dtype() + else: + raise NotImplementedError() + return tensor.to(dtype=dtype) + return tensor diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 310065fc3523..8608c0c9a680 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -97,10 +97,11 @@ def clip_max_len(self, maxlen: int) -> int: pass @abc.abstractclassmethod - def init_batch(self, context_tokens: torch.Tensor, context_length: int): + def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool): """initialize the batch data before the inference steps. 
It will save the intermediate results as object attributes context_length (int): the context token length + compute_attention_mask: bool: set to True to compute attention mask (not needed for FA) Args: context_tokens (torch.Tensor): The padded context tokens including the space for tokens to be generated """ @@ -187,7 +188,7 @@ def clip_max_len(self, maxlen: int) -> int: maxlen = self.model.cfg.encoder_seq_length + 1 return maxlen - def init_batch(self, context_tokens: torch.Tensor, context_length: int): + def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool): """initialize the batch data before the inference steps.""" # Move to GPU. tokenizer = self.model.tokenizer @@ -199,10 +200,17 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int): self.model.cfg.get('reset_position_ids', False), self.model.cfg.get('reset_attention_mask', False), self.model.cfg.get('eod_mask_loss', False), + compute_attention_mask=compute_attention_mask, ) def prepare_batch_at_step( - self, tokens: torch.Tensor, maxlen: int, micro_batch_size: int, step: int, context_length: int + self, + tokens: torch.Tensor, + maxlen: int, + micro_batch_size: int, + step: int, + context_length: int, + compute_attention_mask: bool = True, ) -> Tuple[List[torch.Tensor], List[int]]: """ generate the batch used in inference for each of the steps @@ -226,7 +234,10 @@ def prepare_batch_at_step( # types2use = type_ids[:, context_length - 1].view(batch_size, -1) """Prepare batch for each of the inference steps""" - attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) + attention_mask_repeat = None + if compute_attention_mask: + attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) + setkey_value_array = torch.tensor( [set_inference_key_value_memory] * micro_batch_size, device=torch.cuda.current_device() ) @@ -243,7 +254,7 @@ def __init__(self, model, task_ids): 
self.task_ids = task_ids self.forward_model = self.model - def init_batch(self, context_tokens: torch.Tensor, context_length: int): + def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool): """initialize the batch data before the inference steps.""" # Move to GPU. tokenizer = self.model.tokenizer @@ -255,6 +266,7 @@ def init_batch(self, context_tokens: torch.Tensor, context_length: int): self.model.cfg.get('reset_position_ids', False), self.model.cfg.get('reset_attention_mask', False), self.model.cfg.get('eod_mask_loss', False), + compute_attention_mask=compute_attention_mask, ) def clip_max_len(self, maxlen: int) -> int: @@ -264,7 +276,13 @@ def clip_max_len(self, maxlen: int) -> int: return maxlen def prepare_batch_at_step( - self, tokens: torch.Tensor, maxlen: int, micro_batch_size: int, step: int, context_length: int + self, + tokens: torch.Tensor, + maxlen: int, + micro_batch_size: int, + step: int, + context_length: int, + compute_attention_mask: bool, ) -> Tuple[List[torch.Tensor], List[int]]: # types2use = None if step == 0: @@ -285,7 +303,9 @@ def prepare_batch_at_step( # types2use = type_ids[:, context_length - 1].view(batch_size, -1) """Prepare batch for each of the inference steps""" - attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) + attention_mask_repeat = None + if compute_attention_mask: + attention_mask_repeat = torch.concat([self.attention_mask for _ in range(micro_batch_size)]) setkey_value_array = torch.tensor( [set_inference_key_value_memory] * micro_batch_size, device=torch.cuda.current_device() ) diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index a56304970bdc..6417f887c0cd 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -105,7 +105,7 @@ def 
megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para greedy=sampling_params['use_greedy'], repetition_penalty=sampling_params['repetition_penalty'], min_tokens_to_generate=length_params['min_length'], - **strategy_args, + compute_attention_mask=sampling_params.get("compute_attention_mask", True), **strategy_args, ) compute_prob_response = get_computeprob_response(tokenizer, response, inputs) return compute_prob_response @@ -376,6 +376,7 @@ def synced_generate( top_k=0, top_p=0.0, greedy=False, + compute_attention_mask=True, compute_logprob=False, repetition_penalty=1.2, min_tokens_to_generate=0, @@ -401,6 +402,7 @@ def synced_generate( context_length_tensor, tokens_to_generate, all_probs, + compute_attention_mask=compute_attention_mask, compute_logprob=compute_logprob, temperature=temperature, end_strings=end_strings, @@ -469,6 +471,7 @@ def generate( top_k=0, top_p=0.0, greedy=False, + compute_attention_mask=True, compute_logprob=False, repetition_penalty=1.0, min_tokens_to_generate=0, @@ -550,6 +553,7 @@ def generate( tokens_to_generate, all_probs, temperature, + compute_attention_mask=compute_attention_mask, compute_logprob=compute_logprob, top_k=top_k, top_p=top_p, @@ -635,6 +639,7 @@ def sample_sequence_batch( context_lengths, tokens_to_generate, all_probs=False, + compute_attention_mask=True, compute_logprob=False, type_ids=None, temperature=None, @@ -666,7 +671,7 @@ def sample_sequence_batch( # initialize the batch with torch.no_grad(): context_length = context_lengths.min().item() - inference_strategy.init_batch(context_tokens, context_length) + inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) # added eos_id to support the function generate_samples_eval that passes # eos_id as an argument and needs termination when that id id found.
eod_id = tokenizer.eos_id @@ -685,7 +690,7 @@ def sample_sequence_batch( lengths = torch.ones([batch_size]).long().cuda() * maxlen while context_length < maxlen: batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, micro_batch_size, counter, context_length + tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask ) output = inference_strategy.forward_step(batch, tensor_shape) diff --git a/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py b/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py index 6e992f5348ae..22c657b25613 100644 --- a/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py +++ b/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py @@ -1,76 +1,77 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Use this file to create a lexicon file for Flashlight decoding from an existing KenLM arpa file -# A lexicon file is required for Flashlight decoding in most cases, as it acts as a map from the words -# in you arpa file to the representation used by your ASR AM. 
-# For more details, see: https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr#data-preparation -# -# Usage: python create_lexicon_from_arpa.py --arpa /path/to/english.arpa --model /path/to/model.nemo --lower -# -# - - -import argparse -import os -import re - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Utility script for generating lexicon file from a KenLM arpa file") - parser.add_argument("--arpa", required=True, help="path to your arpa file") - parser.add_argument("--dst", help="directory to store generated lexicon", default=None) - parser.add_argument("--lower", action='store_true', help="Whether to lowercase the arpa vocab") - parser.add_argument("--model", default=None, help="path to Nemo model for its tokeniser") - - args = parser.parse_args() - - if not os.path.exists(args.arpa): - print("ARPA file not detected on disk, aborting!", flush=True) - exit(255) - - if args.dst is not None: - save_path = args.dst - else: - save_path = os.path.dirname(args.arpa) - os.makedirs(save_path, exist_ok=True) - - tokenizer = None - if args.model is not None: - from nemo.collections.asr.models import ASRModel - - model = ASRModel.restore_from(restore_path=args.model, map_location='cpu') - if hasattr(model, 'tokenizer'): - tokenizer = model.tokenizer - else: - print('WARNING: supplied Nemo model does not contain a tokenizer', flush=True) - - lex_file = os.path.join(save_path, os.path.splitext(os.path.basename(args.arpa))[0] + '.lexicon') - print(f"Writing Lexicon file - {lex_file}...", flush=True) - with open(lex_file, "w", encoding='utf_8', newline='\n') as f: - with open(args.arpa, "r", encoding='utf_8') as arpa: - for line in arpa: - # verify if the line corresponds to unigram - if not re.match(r"[-]*[0-9\.]+\t\S+\t*[-]*[0-9\.]*$", line): - continue - word = line.split("\t")[1] - word = word.strip().lower() if args.lower else word.strip() - if word == "" or word == "" or word == "" or word == "": - continue - - if tokenizer 
is None: - f.write("{w}\t{s}\n".format(w=word, s=" ".join(word))) - else: - f.write("{w}\t{s}\n".format(w=word, s=" ".join(tokenizer.text_to_tokens(word)))) - - print("Done!", flush=True) +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Use this file to create a lexicon file for Flashlight decoding from an existing KenLM arpa file +# A lexicon file is required for Flashlight decoding in most cases, as it acts as a map from the words +# in you arpa file to the representation used by your ASR AM. 
+# For more details, see: https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr#data-preparation +# +# Usage: python create_lexicon_from_arpa.py --arpa /path/to/english.arpa --model /path/to/model.nemo --lower +# +# + + +import argparse +import os +import re + +from nemo.utils import logging + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Utility script for generating lexicon file from a KenLM arpa file") + parser.add_argument("--arpa", required=True, help="path to your arpa file") + parser.add_argument("--dst", help="directory to store generated lexicon", default=None) + parser.add_argument("--lower", action='store_true', help="Whether to lowercase the arpa vocab") + parser.add_argument("--model", default=None, help="path to Nemo model for its tokeniser") + + args = parser.parse_args() + + if not os.path.exists(args.arpa): + logging.critical(f"ARPA file [ {args.arpa} ] not detected on disk, aborting!") + exit(255) + + if args.dst is not None: + save_path = args.dst + else: + save_path = os.path.dirname(args.arpa) + os.makedirs(save_path, exist_ok=True) + + tokenizer = None + if args.model is not None: + from nemo.collections.asr.models import ASRModel + + model = ASRModel.restore_from(restore_path=args.model, map_location='cpu') + if hasattr(model, 'tokenizer'): + tokenizer = model.tokenizer + else: + logging.warning('Supplied Nemo model does not contain a tokenizer') + + lex_file = os.path.join(save_path, os.path.splitext(os.path.basename(args.arpa))[0] + '.lexicon') + + logging.info(f"Writing Lexicon file to: {lex_file}...") + with open(lex_file, "w", encoding='utf_8', newline='\n') as f: + with open(args.arpa, "r", encoding='utf_8') as arpa: + for line in arpa: + # verify if the line corresponds to unigram + if not re.match(r"[-]*[0-9\.]+\t\S+\t*[-]*[0-9\.]*$", line): + continue + word = line.split("\t")[1] + word = word.strip().lower() if args.lower else word.strip() + if word == "" or word == "" or word == "" or 
word == "": + continue + + if tokenizer is None: + f.write("{w}\t{s}\n".format(w=word, s=" ".join(word))) + else: + f.write("{w}\t{s}\n".format(w=word, s=" ".join(tokenizer.text_to_tokens(word)))) diff --git a/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py b/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py new file mode 100644 index 000000000000..14d7b6ae54ea --- /dev/null +++ b/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py @@ -0,0 +1,212 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +A script to convert the Mosaic MPT-7B checkpoint on HuggingFace to Megatron GPTModel +This script is hardcoded specifically for the MPT-7B pretrained model only, and is not +generalisable to any other models. + +This script will load and convert the model entirely on CPU for OOM safety, but there +is an option to put the model onto GPU before the save down, which sets the map_location +to cuda for the restore_from call. You can do this by adding --cuda to this script call. + +This script requires that you have downloaded the 2 .bin weight files for MPT-7B from +HuggingFace located here: https://huggingface.co/mosaicml/mpt-7b/tree/main +These files MUST have the following file names and be saved somewhere where this script +can read them: + pytorch_model-00001-of-00002.bin + pytorch_model-00002-of-00002.bin + +This script will generate a Megatron model with TP=1 and PP=1. 
If you need different TP/PP +values, then after running this script, please use the script located below to set whatever +TP/PP values you want: + NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py + + +Here is an example usage command: + +```python +python scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py -i /path/to/mpt_7b -o /path/to/save +``` + +""" + + +import argparse +import os + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.nlp.models.language_modeling.megatron import GPTModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.utils import logging + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '-i', '--input', required=True, type=str, help='path to the two MPT-7B .bin weight files from HuggingFace' + ) + parser.add_argument( + '-o', '--output', required=False, default=None, type=str, help='path to dir where to store output .nemo file' + ) + parser.add_argument('--cuda', action='store_true', help='put Nemo model onto GPU prior to savedown') + + args = parser.parse_args() + + if not os.path.exists(args.input): + logging.critical(f'Input directory [ {args.input} ] does not exist or cannot be found. 
Aborting.') + exit(255) + + model_dict = { + 'micro_batch_size': 4, + 'global_batch_size': 8, + 'rampup_batch_size': None, + 'tensor_model_parallel_size': 1, + 'pipeline_model_parallel_size': 1, + 'virtual_pipeline_model_parallel_size': None, + 'megatron_amp_O2': True, + 'transformer_engine': False, + 'use_cpu_initialization': True, + 'hidden_size': 4096, + 'max_position_embeddings': 2048, + 'num_layers': 32, + 'num_attention_heads': 32, + 'ffn_hidden_size': 4 * 4096, + 'precision': 'bf16', + 'pre_process': True, + 'post_process': True, + 'num_tokentypes': 0, + 'apply_query_key_layer_scaling': False, + 'parallel_output': False, + 'bias': False, + 'bias_dropout_add_fusion': False, + 'bias_activation_fusion': False, + 'transformer_block_type': 'pre_ln', + 'normalization': 'low_precision_layernorm', + 'fp32_residual_connection': False, + 'hidden_dropout': 0, + 'attention_dropout': 0, + 'ffn_dropout': 0, + 'megatron_legacy': True, + 'share_embeddings_and_output_weights': True, + 'sequence_parallel': False, + 'position_embedding_type': 'alibi', + 'normalize_attention_scores': True, + 'use_flash_attention': False, + 'override_vocab_size': 50432, + } + tokeniser_dict = { + 'library': 'huggingface', + 'type': 'EleutherAI/gpt-neox-20b', + 'use_fast': True, + } + optim_dict = { + 'name': 'fused_adam', + 'lr': 2e-4, + 'weight_decay': 0.01, + } + trainer_dict = { + 'devices': 1, + 'num_nodes': 1, + 'accelerator': 'gpu' if args.cuda else 'cpu', + 'precision': 'bf16', + 'logger': False, # logger provided by exp_manager + 'enable_checkpointing': False, + 'replace_sampler_ddp': False, + 'max_epochs': -1, # PTL default. In practice, max_steps will be reached first. 
+ 'max_steps': 100000, # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + 'log_every_n_steps': 10, + 'val_check_interval': 100, + 'limit_val_batches': 50, + 'limit_test_batches': 500, + 'accumulate_grad_batches': 1, + 'gradient_clip_val': 1.0, + 'benchmark': False, + 'enable_model_summary': False, + } + + model_dict['tokenizer'] = tokeniser_dict + model_dict['optim'] = optim_dict + + omega_cfg = OmegaConf.create(model_dict) + + trainer = pl.Trainer(**trainer_dict) + + model = MegatronGPTModel(omega_cfg, trainer) + + model_keys = list(model.state_dict().keys()) + model_dtypes = list(set([model.state_dict()[x].dtype for x in model_keys])) + + if not (len(model_dtypes) == 1 and model_dtypes[0] is torch.bfloat16): + model = model.bfloat16() + + if args.cuda: + model = model.cuda() + + mpt_1 = torch.load(os.path.join(args.input, 'pytorch_model-00001-of-00002.bin'), map_location="cpu") + mpt_2 = torch.load(os.path.join(args.input, 'pytorch_model-00002-of-00002.bin'), map_location="cpu") + mpt_dict = {**mpt_1, **mpt_2} + del mpt_1, mpt_2 + + def convert_state_dict(state_dict, amp=False): + def get_new_key(old_key): + if old_key == 'transformer.wte.weight': + return 'language_model.embedding.word_embeddings.weight' + elif old_key == 'transformer.norm_f.weight': + return 'language_model.encoder.final_layernorm.weight' + else: + p1 = old_key.replace('transformer.blocks.', 'language_model.encoder.layers.') + p2 = p1.replace('norm_1.weight', 'input_layernorm.weight') + p3 = p2.replace('attn.Wqkv.weight', 'self_attention.query_key_value.weight') + p4 = p3.replace('attn.out_proj.weight', 'self_attention.dense.weight') + p5 = p4.replace('norm_2.weight', 'post_attention_layernorm.weight') + p6 = p5.replace('ffn.up_proj.weight', 'mlp.dense_h_to_4h.weight') + p7 = p6.replace('ffn.down_proj.weight', 'mlp.dense_4h_to_h.weight') + + return p7 + + new_dict = {} + + for old_key, val in state_dict.items(): + new_key = get_new_key(old_key) 
+ if amp: + new_key = 'module.' + new_key + + new_dict[new_key] = val + + return new_dict + + convert_dict = convert_state_dict(mpt_dict, amp=model_dict['megatron_amp_O2']) + + if model_dict['megatron_amp_O2']: + missing_keys, unexpected_keys = model.model.load_state_dict(convert_dict, strict=True) + else: + missing_keys, unexpected_keys = super(GPTModel, model.model).load_state_dict(convert_dict, strict=True) + + if len(missing_keys) > 0: + logging.critical('Missing keys were detected during the load, something has gone wrong. Aborting.') + logging.critical(f'Missing keys: \n{missing_keys}') + exit(255) + + if len(unexpected_keys) > 0: + logging.warning('Unexpected keys were detected which should not happen. Please investigate.') + logging.warning(f'Unexpected keys: \n{unexpected_keys}') + + if args.output is None: + args.output = os.path.dirname(os.path.abspath(__file__)) + + model.save_to(os.path.join(args.output, 'megatron_mpt_7b_base_tp1_pp1.nemo')) diff --git a/tests/collections/nlp/test_flash_attention.py b/tests/collections/nlp/test_flash_attention.py new file mode 100644 index 000000000000..cead91ff312a --- /dev/null +++ b/tests/collections/nlp/test_flash_attention.py @@ -0,0 +1,247 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random + +import pytest +import torch +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.modules.common.megatron.attention import CoreAttention +from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo +from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + +try: + from apex.transformer.enums import AttnMaskType + + HAVE_APEX = True +except (ImportError, ModuleNotFoundError): + HAVE_APEX = False + +try: + import flash_attn + + HAVE_FA = True +except (ImportError, ModuleNotFoundError): + HAVE_FA = False + +try: + import triton + + HAVE_TRITON = True +except (ImportError, ModuleNotFoundError): + HAVE_TRITON = False + +import pynvml + + +def HAVE_AMPERE_GPU(): + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + device_arch = pynvml.nvmlDeviceGetArchitecture(handle) + pynvml.nvmlShutdown() + return device_arch == pynvml.NVML_DEVICE_ARCH_AMPERE + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.skipif(not HAVE_APEX, reason="apex is not installed") +class TestFlashAttention: + @classmethod + def setup_class(cls): + if not torch.cuda.is_available(): + return + + GPUS = 1 + TP_SIZE = GPUS + PP_SIZE = 1 + MB_SIZE = 4 + GB_SIZE = 8 + SEED = 1234 + trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) + + initialize_model_parallel_for_nemo( + world_size=trainer.world_size, + global_rank=trainer.global_rank, + local_rank=trainer.local_rank, + tensor_model_parallel_size=TP_SIZE, + pipeline_model_parallel_size=PP_SIZE, + micro_batch_size=MB_SIZE, + global_batch_size=GB_SIZE, + seed=SEED, + apex_transformer_log_level=30, + ) + + @pytest.fixture() + def cfg(self): + cfg = { + 'bz': random.randint(1, 7), + 'sl': random.randint(1, 7), + 'head': random.randint(1, 7), + 'device': torch.cuda.current_device(), + } + # flash 
attention requires head dimensions are multiples of 8 + head_dim = random.randint(1, 7) * 8 + cfg['hidden'] = cfg['head'] * head_dim + + return cfg + + @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") + @pytest.mark.unit + def test_flash_attention(self, cfg): + device = cfg['device'] + bz, sl, np, h = cfg['bz'], cfg['sl'], cfg['head'], cfg['hidden'] + hn = h // np + + q = torch.rand(sl, bz, np, hn, device=device).half() + k = torch.rand(sl, bz, np, hn, device=device).half() + v = torch.rand(sl, bz, np, hn, device=device).half() + + attention_mask_2d = torch.arange(sl, device=device).unsqueeze(0) < torch.randint( + 1, sl, (bz,), device=device + ).unsqueeze(1) + + attention_mask_padding_3d = build_attention_mask_3d( + source_mask=attention_mask_2d, target_mask=attention_mask_2d, attn_mask_type=AttnMaskType.padding + ).unsqueeze(1) + + attention_mask_causal_3d = build_attention_mask_3d( + source_mask=attention_mask_2d, target_mask=attention_mask_2d, attn_mask_type=AttnMaskType.causal + ).unsqueeze(1) + + # Non-causal + attention = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + ) + + attention_fa = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_padding_3d) + out_fa = attention_fa(q, k, v, attention_mask_padding_3d) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + out_fa = attention_fa(q, k, v, attention_mask_2d) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + + # Causal + attention = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.causal, + attention_dropout=0.0, + ) + + attention_fa = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.causal, + 
attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_causal_3d) + out_fa = attention_fa(q, k, v, attention_mask_causal_3d) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + out_fa = attention_fa(q, k, v, attention_mask_2d) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + + @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") + @pytest.mark.skipif(not HAVE_TRITON, reason="triton is not installed") + @pytest.mark.skipif( + not HAVE_AMPERE_GPU(), + reason="should only run on AMPERE GPU. Please see https://github.com/HazyResearch/flash-attention/issues/245", + ) + @pytest.mark.unit + def test_flash_attention_triton(self, cfg): + device = cfg['device'] + bz, sl, np, h = cfg['bz'], cfg['sl'], cfg['head'], cfg['hidden'] + hn = h // np + + q = torch.rand(sl, bz, np, hn, device=device).half() + k = torch.rand(sl, bz, np, hn, device=device).half() + v = torch.rand(sl, bz, np, hn, device=device).half() + + attention_mask_2d = torch.arange(sl, device=device).unsqueeze(0) < torch.randint( + 1, sl, (bz,), device=device + ).unsqueeze(1) + + attention_mask_padding_3d = build_attention_mask_3d( + source_mask=attention_mask_2d, target_mask=attention_mask_2d, attn_mask_type=AttnMaskType.padding + ).unsqueeze(1) + + attention_mask_causal_3d = build_attention_mask_3d( + source_mask=attention_mask_2d, target_mask=attention_mask_2d, attn_mask_type=AttnMaskType.causal + ).unsqueeze(1) + + attention_bias = torch.rand(bz, np, sl, sl, device=device) + + # Non-causal + attention = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + ) + + attention_fa = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) 
+ out_fa = attention_fa(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + out_fa = attention_fa(q, k, v, attention_mask_2d, relative_position_bias=attention_bias) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + + # Causal + attention = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.causal, + attention_dropout=0.0, + ) + + attention_fa = CoreAttention( + layer_number=1, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.causal, + attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_causal_3d, relative_position_bias=attention_bias) + out_fa = attention_fa(q, k, v, attention_mask_causal_3d, relative_position_bias=attention_bias) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + out_fa = attention_fa(q, k, v, attention_mask_2d, relative_position_bias=attention_bias) + assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) diff --git a/tests/collections/nlp/test_position_embedding.py b/tests/collections/nlp/test_position_embedding.py new file mode 100644 index 000000000000..263ca8669d81 --- /dev/null +++ b/tests/collections/nlp/test_position_embedding.py @@ -0,0 +1,211 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random + +import pytest +import torch + +from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType +from nemo.collections.nlp.modules.common.megatron.position_embedding import ( + ALiBiRelativePositionEmbedding, + KERPLERelativePositionEmbedding, + RotaryEmbedding, + SandwichRelativePositionEmbedding, + T5RelativePositionEmbedding, + XPOSPositionEmbedding, +) +from nemo.collections.nlp.modules.common.megatron.position_embedding.rotary_position_embedding import ( + apply_rotary_pos_emb, +) +from nemo.collections.nlp.modules.common.megatron.utils import init_method_normal + + +@pytest.fixture() +def cfg(): + cfg = { + 'max_seq_len': 8, + 'num_attention_heads': 2, + 'layer_type': LayerType.encoder, + 'hidden_size': 4, + 'rpe_init_method_std': 0.02, + 'rpe_num_buckets': 6, + 'rpe_max_distance': 16, + } + return cfg + + +@pytest.mark.unit +def test_alibi(cfg): + # non-causal + PE_nc = ALiBiRelativePositionEmbedding( + bidirectional=True, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + ) + + # causal + PE_c = ALiBiRelativePositionEmbedding( + bidirectional=False, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + ) + + q_len = k_len = random.randint(1, cfg['max_seq_len'] * 2) + + bias_nc = PE_nc(q_len, k_len) + assert bias_nc.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert torch.equal(bias_nc, bias_nc.transpose(2, 3)) + + bias_c = PE_c(q_len, k_len) + assert bias_c.shape == (1, cfg['num_attention_heads'], 1, k_len) + assert torch.equal(bias_c, bias_nc[:, :, -1:, :]) + + +@pytest.mark.unit +def test_sandwich(cfg): + # non-causal + PE_nc = SandwichRelativePositionEmbedding( + bidirectional=True, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + hidden_size=cfg['hidden_size'], + ) + + # causal + PE_c = 
SandwichRelativePositionEmbedding( + bidirectional=False, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + hidden_size=cfg['hidden_size'], + ) + + q_len = k_len = random.randint(1, cfg['max_seq_len'] * 2) + + bias_nc = PE_nc(q_len, k_len) + assert bias_nc.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert torch.equal(bias_nc, bias_nc.transpose(2, 3)) + + bias_c = PE_c(q_len, k_len) + assert bias_c.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert torch.all(torch.triu(bias_c, diagonal=0) == 0) + + +@pytest.mark.unit +def test_kerple(cfg): + # non-causal + PE_nc = KERPLERelativePositionEmbedding( + bidirectional=True, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + ) + + # causal + PE_c = KERPLERelativePositionEmbedding( + bidirectional=False, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + max_seq_len=cfg['max_seq_len'], + ) + + q_len = k_len = random.randint(1, cfg['max_seq_len'] * 2) + + bias_nc = PE_nc(q_len, k_len) + assert bias_nc.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert torch.equal(bias_nc, bias_nc.transpose(2, 3)) + + bias_c = PE_c(q_len, k_len) + assert bias_c.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert torch.all(torch.triu(bias_c, diagonal=0) == 0) + + +@pytest.mark.unit +def test_t5relative(cfg): + # non-causal + PE_nc = T5RelativePositionEmbedding( + bidirectional=True, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + init_method=init_method_normal(cfg['rpe_init_method_std']), + relative_position_num_buckets=cfg['rpe_num_buckets'], + relative_position_max_distance=cfg['rpe_max_distance'], + ) + + # causal + PE_c = T5RelativePositionEmbedding( + bidirectional=False, + num_attention_heads=cfg['num_attention_heads'], + layer_type=cfg['layer_type'], + 
init_method=init_method_normal(cfg['rpe_init_method_std']), + relative_position_num_buckets=cfg['rpe_num_buckets'], + relative_position_max_distance=cfg['rpe_max_distance'], + ) + + q_len = k_len = random.randint(1, cfg['max_seq_len'] * 2) + + bias_nc = PE_nc(q_len, k_len) + assert bias_nc.shape == (1, cfg['num_attention_heads'], q_len, k_len) + + bias_c = PE_c(q_len, k_len) + assert bias_c.shape == (1, cfg['num_attention_heads'], q_len, k_len) + assert ( + len(torch.triu(bias_c, diagonal=0).unique()) == cfg['num_attention_heads'] + 1 + if q_len > 1 + else cfg['num_attention_heads'] + ) + + +@pytest.mark.unit +def test_rotary(cfg): + PE = RotaryEmbedding(dim=cfg['hidden_size']) + rotary_embedding = PE(cfg['max_seq_len']) + + x = torch.rand(cfg['max_seq_len'], 1, cfg['num_attention_heads'], cfg['hidden_size']) + x_rotary = apply_rotary_pos_emb(x, rotary_embedding) + assert x_rotary.shape == x.shape + + hd = cfg['hidden_size'] // 2 + x_rotary_test = torch.cat( + ( + x[..., :hd] * rotary_embedding[..., :hd].cos() + x[..., hd:] * rotary_embedding[..., hd:].sin() * -1, + x[..., :hd] * rotary_embedding[..., :hd].sin() + x[..., hd:] * rotary_embedding[..., hd:].cos(), + ), + dim=-1, + ) + assert torch.equal(x_rotary, x_rotary_test) + + offset = random.choice(range(1, cfg['max_seq_len'])) + rotary_embedding_offset = PE(cfg['max_seq_len'], offset=offset) + x_rotary = apply_rotary_pos_emb(x[: offset + 1], rotary_embedding[: offset + 1]) + x_rotary_offset = apply_rotary_pos_emb(x[offset : offset + 1], rotary_embedding_offset[:1]) + assert torch.equal(x_rotary[-1], x_rotary_offset[0]) + + +@pytest.mark.unit +def test_xpos(cfg): + PE = XPOSPositionEmbedding(head_dim=cfg['hidden_size']) + x = torch.rand(cfg['max_seq_len'], 1, cfg['num_attention_heads'], cfg['hidden_size']) + + x_rotary = PE(x) + assert x_rotary.shape == x.shape + + offset = random.choice(range(1, cfg['max_seq_len'])) + x_rotary = PE(x[: offset + 1]) + x_rotary_offset = PE(x[offset : offset + 1], offset=offset) + 
assert torch.equal(x_rotary[-1], x_rotary_offset[0]) diff --git a/tests/collections/nlp/test_retrieval_module.py b/tests/collections/nlp/test_retrieval_module.py index 3a2d46f4fed2..08425964e566 100644 --- a/tests/collections/nlp/test_retrieval_module.py +++ b/tests/collections/nlp/test_retrieval_module.py @@ -21,6 +21,7 @@ from nemo.collections.nlp.modules.common.megatron.attention import ParallelChunkedCrossAttention from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo +from nemo.collections.nlp.modules.common.megatron.position_embedding import RotaryEmbedding from nemo.collections.nlp.modules.common.megatron.retrieval_token_level_encoder_decoder import ( MegatronRetrievalTokenLevelEncoderDecoderModule, ) @@ -28,7 +29,6 @@ MegatronRetrievalTransformerDecoderModule, MegatronRetrievalTransformerEncoderModule, ) -from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding from nemo.collections.nlp.modules.common.megatron.utils import ( build_attention_mask_3d, init_method_normal, diff --git a/tests/collections/nlp/test_retrieval_module_inference.py b/tests/collections/nlp/test_retrieval_module_inference.py index 16e7e556bd10..a9aa002815b2 100644 --- a/tests/collections/nlp/test_retrieval_module_inference.py +++ b/tests/collections/nlp/test_retrieval_module_inference.py @@ -22,6 +22,7 @@ from nemo.collections.nlp.modules.common.megatron.attention import ParallelChunkedCrossAttention from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo +from nemo.collections.nlp.modules.common.megatron.position_embedding import RotaryEmbedding from nemo.collections.nlp.modules.common.megatron.retrieval_token_level_encoder_decoder import ( MegatronRetrievalTokenLevelEncoderDecoderModule, ) @@ 
-29,7 +30,6 @@ MegatronRetrievalTransformerDecoderModule, MegatronRetrievalTransformerEncoderModule, ) -from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding from nemo.collections.nlp.modules.common.megatron.utils import ( build_attention_mask_3d, init_method_normal, From b54e7fd6b0a8197270a6e33e262d40083d213254 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 13 Jun 2023 09:53:48 -0700 Subject: [PATCH 038/123] fix (#6842) (#6843) Signed-off-by: Yi Dong Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> --- tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index e1aa32f7bbf1..b7ae11ef3f5d 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -586,7 +586,7 @@ "outputs": [], "source": [ "CHECKPONT_FILE_NAME = megatron_gpt--val_loss=1.17-step=10047-consumed_samples=80376.0-last.ckpt # change it to your checkpoint file name\n", - "!python -m torch.distributed.launch --nproc_per_node=1 megatron_ckpt_to_nemo.py \\\n", + "!torchrun --nproc_per_node=1 megatron_ckpt_to_nemo.py \\\n", " --checkpoint_folder=gpt_creditcard_results/megatron_gpt/checkpoints/ \\\n", " --checkpoint_name={CHECKPONT_FILE_NAME} \\\n", " --nemo_file_path=tabular.nemo \\\n", From 02c30689a9bfc3ca3950e010525610557979c646 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:28:19 -0400 Subject: [PATCH 039/123] Add Frame-VAD to ASR+VAD pipeline (#6464) * add model, dataset, necessary utils and tests Signed-off-by: stevehuang52 * fix tarred data Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * 
add fvad examples and update utils Signed-off-by: stevehuang52 * add copyright Signed-off-by: stevehuang52 * add frame-vad to ASR+VAD pipeline, add drop-frame mode Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * fix masking Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * slight refactor Signed-off-by: stevehuang52 * fix rnnt output Signed-off-by: stevehuang52 * add support for hybrid model Signed-off-by: stevehuang52 * update tutorial Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * merge frame- and segment-vad scripts Signed-off-by: stevehuang52 * update tutorial Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- examples/asr/asr_vad/README.md | 27 ++- .../asr/asr_vad/speech_to_text_with_vad.py | 116 +++++++---- .../conf/vad/frame_vad_infer_postprocess.yaml | 2 +- .../speech_classification/frame_vad_infer.py | 6 +- .../speech_to_frame_label.py | 9 + nemo/collections/asr/data/feature_to_label.py | 185 ++++++++++++++++-- .../asr/data/feature_to_label_dataset.py | 26 ++- nemo/collections/asr/data/feature_to_text.py | 91 ++++++--- .../asr/data/feature_to_text_dataset.py | 4 + .../Offline_ASR_with_VAD_for_CTC_models.ipynb | 14 +- tutorials/asr/Voice_Activity_Detection.ipynb | 23 ++- 11 files changed, 414 insertions(+), 89 deletions(-) diff --git a/examples/asr/asr_vad/README.md b/examples/asr/asr_vad/README.md index 03c7efa146b9..9385b96a79ea 100644 --- a/examples/asr/asr_vad/README.md +++ b/examples/asr/asr_vad/README.md @@ -25,23 +25,30 @@ To run the code with ASR+VAD default settings: ```bash python speech_to_text_with_vad.py \ manifest_filepath=/PATH/TO/MANIFEST.json \ - vad_model=vad_multilingual_marblenet \ + vad_model=vad_multilingual_frame_marblenet \ 
asr_model=stt_en_conformer_ctc_large \ - vad_config=../conf/vad/vad_inference_postprocess.yaml + vad_config=../conf/vad/frame_vad_infer_postprocess.yaml ``` -To use only ASR and disable VAD, set `vad_model=None` and `use_rttm=False`. +- To use only ASR and disable VAD, set `vad_model=None` and `use_rttm=False`. -To use only VAD, set `asr_model=None` and specify both `vad_model` and `vad_config`. +- To use only VAD, set `asr_model=None` and specify both `vad_model` and `vad_config`. -To enable profiling, set `profiling=True`, but this will significantly slow down the program. +- To enable profiling, set `profiling=True`, but this will significantly slow down the program. -To use or disable feature masking, set `use_rttm` to `True` or `False`. +### Using RTTM to handle non-speech audio segments +- To use or disable RTTM usage, set `use_rttm` to `True` or `False`. There are two options to use RTTM files, as specified by the parameter `rttm_mode`, which must be one of `mask` or `drop`. For `mask`, the RTTM file will be used to mask the non-speech features. For `drop`, the RTTM file will be used to drop the non-speech features. -To normalize feature before masking, set `normalize=pre_norm`, -and set `normalize=post_norm` for masking before normalization. +- It's recommended that for `rttm_mode='drop'`, use larger `pad_onset` and `pad_offset` to avoid dropping speech features. -To use a specific value for feature masking, set `feat_mask_val` to the desired value. +- To use a specific value for feature masking, set `feat_mask_val` to the desired value. Default is `feat_mask_val=None`, where -16.635 (zero log mel-spectrogram value) will be used for `post_norm` and 0 (same as SpecAugment) will be used for `pre_norm`. -See more options in the `InferenceConfig` class. +- To normalize feature before masking, set `normalize=pre_norm`, and set `normalize=post_norm` for masking before normalization. 
+ +### Frame-VAD and Segment-VAD +- By default, `speech_to_text_with_vad.py` and `vad_config=../conf/vad/frame_vad_infer_postprocess.yaml` will use a frame-VAD model, which generates a speech/non-speech prediction for each audio frame of 20ms. +- To use segment-VAD, use `speech_to_text_with_vad.py vad_type='segment' vad_config=../conf/vad/vad_inference_postprocessing.yaml` instead. In segment-VAD, the audio is split into segments and VAD is performed on each segment. The segments are then stitched together to form the final output. The segment size and stride can be specified by `window_length_in_sec` and `shift_length_in_sec` in the VAD config (e.g., `../conf/vad/vad_inference_postprocessing.yaml`) respectively. The default values are 0.63 seconds and 0.08 seconds respectively. + +### More options +- See more options in the `InferenceConfig` data class. diff --git a/examples/asr/asr_vad/speech_to_text_with_vad.py b/examples/asr/asr_vad/speech_to_text_with_vad.py index b22ff709c344..ecdfac42f665 100644 --- a/examples/asr/asr_vad/speech_to_text_with_vad.py +++ b/examples/asr/asr_vad/speech_to_text_with_vad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,9 +29,9 @@ ```bash python speech_to_text_with_vad.py \ manifest_filepath=/PATH/TO/MANIFEST.json \ - vad_model=vad_multilingual_marblenet \ + vad_model=vad_multilingual_frame_marblenet \ asr_model=stt_en_conformer_ctc_large \ - vad_config=../conf/vad/vad_inference_postprocess.yaml + vad_config=../conf/vad/frame_vad_infer_postprocess.yaml ``` To use only ASR and disable VAD, set `vad_model=None` and `use_rttm=False`. @@ -40,13 +40,15 @@ To enable profiling, set `profiling=True`, but this will significantly slow down the program. 
-To use or disable feature masking, set `use_rttm` to `True` or `False`. +To use or disable feature masking/dropping based on RTTM files, set `use_rttm` to `True` or `False`. +There are two ways to use RTTM files, either by masking the features (`rttm_mode=mask`) or by dropping the features (`rttm_mode=drop`). +For audio files that contain long non-speech regions between speech segments, dropping frames is recommended. To normalize feature before masking, set `normalize=pre_norm`, and set `normalize=post_norm` for masking before normalization. To use a specific value for feature masking, set `feat_mask_val` to the desired value. -Default is `feat_mask_val=None`, where -16.530 will be used for `post_norm` and 0 will be used for `pre_norm`. +Default is `feat_mask_val=None`, where -16.635 will be used for `post_norm` and 0 will be used for `pre_norm`. See more options in the `InferenceConfig` class. """ @@ -72,10 +74,10 @@ from nemo.collections.asr.models import ASRModel, EncDecClassificationModel from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest from nemo.collections.asr.parts.utils.vad_utils import ( - extract_audio_features, generate_overlap_vad_seq, generate_vad_segment_table, get_vad_stream_status, + init_frame_vad_model, init_vad_model, ) from nemo.core.config import hydra_runner @@ -97,15 +99,16 @@ class InferenceConfig: vad_model: Optional[str] = None # Path to a .nemo file or a pretrained NeMo model on NGC vad_config: Optional[str] = None # Path to a yaml file containing VAD post-processing configs manifest_filepath: Optional[str] = None # Path to dataset's JSON manifest - audio_dir: Optional[str] = None + audio_dir: Optional[str] = None # Path to a directory containing audio files, use this if no manifest is provided use_rttm: bool = True # whether to use RTTM + rttm_mode: str = "mask" # how to use RTTM files, choices=[`mask`, `drop`] feat_mask_val: Optional[float] = None # value used to mask features based on RTTM, set None to 
use defaults normalize: Optional[ str - ] = "post_norm" # whether and where to normalize feature, choices=[None, `pre_norm`, `post_norm`] + ] = "post_norm" # whether and where to normalize audio feature, choices=[None, `pre_norm`, `post_norm`] normalize_type: str = "per_feature" # how to determine mean and std used for normalization - use_pure_noise: bool = False # whether input is pure noise or not. + normalize_audio_db: Optional[float] = None # set to normalize RMS DB of audio before extracting audio features profiling: bool = False # whether to enable pytorch profiling @@ -113,13 +116,13 @@ class InferenceConfig: batch_size: int = 1 # batch size for ASR. Feature extraction and VAD only support single sample per batch. num_workers: int = 8 sample_rate: int = 16000 - frame_unit_time_secs: float = 0.01 # unit time per frame in seconds, equal to `window_stride` in ASR configs. + frame_unit_time_secs: float = 0.01 # unit time per frame in seconds, equal to `window_stride` in ASR configs, typically 10ms. audio_type: str = "wav" # Output settings, no need to change output_dir: Optional[str] = None # will be automatically set by the program output_filename: Optional[str] = None # will be automatically set by the program - pred_name_postfix: Optional[str] = None # If you need to use another model name, rather than standard one. + pred_name_postfix: Optional[str] = None # If you need to use another model name, other than the standard one. 
# Set to True to output language ID information compute_langs: bool = False @@ -130,6 +133,9 @@ class InferenceConfig: # Decoding strategy for RNNT models rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1) + # VAD model type + vad_type: str = "frame" # which type of VAD to use, choices=[`frame`, `segment`] + @hydra_runner(config_name="InferenceConfig", schema=InferenceConfig) def main(cfg): @@ -243,7 +249,10 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C out_dir.mkdir(parents=True, exist_ok=True) torch.set_grad_enabled(False) - vad_model = EncDecClassificationModel.from_pretrained("vad_multilingual_marblenet") + if cfg.vad_model: + vad_model = init_frame_vad_model(cfg.vad_model) + else: + vad_model = EncDecClassificationModel.from_pretrained("vad_multilingual_marblenet") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vad_model = vad_model.to(device) vad_model.eval() @@ -256,6 +265,7 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C 'labels': ['infer',], 'num_workers': cfg.num_workers, 'shuffle': False, + 'normalize_audio_db': cfg.normalize_audio_db, } ) @@ -284,7 +294,13 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C def run_vad_inference(manifest_filepath: str, cfg: DictConfig, record_fn: Callable) -> str: logging.info("Start VAD inference pipeline...") - vad_model = init_vad_model(cfg.vad_model) + if cfg.vad_type == "segment": + vad_model = init_vad_model(cfg.vad_model) + elif cfg.vad_type == "frame": + vad_model = init_frame_vad_model(cfg.vad_model) + else: + raise ValueError(f"Unknown VAD type: {cfg.vad_type}, supported types: ['segment', 'frame']") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vad_model = vad_model.to(device) vad_model.eval() @@ -358,8 +374,6 @@ def run_vad_inference(manifest_filepath: str, cfg: DictConfig, record_fn: Callab logging.info(f"Generating segment tables 
with postprocessing params: {vad_cfg.vad.parameters.postprocessing}") segment_dir_name = "vad_rttm" for key, val in vad_cfg.vad.parameters.postprocessing.items(): - if key == "use_rttm": - continue segment_dir_name = segment_dir_name + "-" + str(key) + str(val) segment_dir = Path(cfg.output_dir) / Path(segment_dir_name) @@ -368,13 +382,13 @@ def run_vad_inference(manifest_filepath: str, cfg: DictConfig, record_fn: Callab else: segment_dir.mkdir(parents=True) t0 = time.time() - vad_cfg.vad.parameters.postprocessing.use_rttm = True segment_dir = generate_vad_segment_table( vad_pred_dir=pred_dir, postprocessing_params=vad_cfg.vad.parameters.postprocessing, frame_length_in_sec=frame_length_in_sec, num_workers=cfg.num_workers, out_dir=segment_dir, + use_rttm=True, ) t1 = time.time() logging.info(f"Time elapsed: {t1 - t0: .2f} seconds") @@ -432,9 +446,14 @@ def generate_vad_frame_pred( with record_fn("vad_infer_other"): probs = torch.softmax(log_probs, dim=-1) + if len(probs.shape) == 3: + # squeeze the batch dimension, since batch size is 1 + probs = probs.squeeze(0) # [1,T,C] -> [T,C] pred = probs[:, 1] - if status[i] == 'start': + if window_length_in_sec == 0: + to_save = pred + elif status[i] == 'start': to_save = pred[:-trunc] elif status[i] == 'next': to_save = pred[trunc:-trunc_l] @@ -443,11 +462,13 @@ def generate_vad_frame_pred( else: to_save = pred + to_save = to_save.cpu().tolist() all_len += len(to_save) + outpath = os.path.join(out_dir, data[i] + ".frame") with open(outpath, "a", encoding='utf-8') as fout: - for f in range(len(to_save)): - fout.write('{0:0.4f}\n'.format(to_save[f])) + for p in to_save: + fout.write(f'{p:0.4f}\n') del test_batch if status[i] == 'end' or status[i] == 'single': @@ -476,18 +497,30 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: # Setup decoding strategy decode_function = None - if hasattr(asr_model, 'change_decoding_strategy'): - # Check if ctc or rnnt model - if hasattr(asr_model, 'joint'): # RNNT model + 
decoder_type = cfg.get("decoder_type", None) + if not hasattr(asr_model, 'change_decoding_strategy'): + raise ValueError(f"ASR model {cfg.asr_model} does not support decoding strategy.") + if decoder_type is not None: # Hybrid model + if decoder_type == 'rnnt': cfg.rnnt_decoding.fused_batch_size = -1 cfg.rnnt_decoding.compute_langs = cfg.compute_langs - asr_model.change_decoding_strategy(cfg.rnnt_decoding) + asr_model.change_decoding_strategy(cfg.rnnt_decoding, decoder_type=decoder_type) decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor - else: - asr_model.change_decoding_strategy(cfg.ctc_decoding) + elif decoder_type == 'ctc': + asr_model.change_decoding_strategy(cfg.ctc_decoding, decoder_type=decoder_type) decode_function = asr_model.decoding.ctc_decoder_predictions_tensor + else: + raise ValueError( + f"Unknown decoder type for hybrid model: {decoder_type}, supported types: ['rnnt', 'ctc']" + ) + elif hasattr(asr_model, 'joint'): # RNNT model + cfg.rnnt_decoding.fused_batch_size = -1 + cfg.rnnt_decoding.compute_langs = cfg.compute_langs + asr_model.change_decoding_strategy(cfg.rnnt_decoding) + decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor else: - raise ValueError(f"Only support CTC or RNNT models that have `change_decoding_strategy()` implemented.") + asr_model.change_decoding_strategy(cfg.ctc_decoding) + decode_function = asr_model.decoding.ctc_decoder_predictions_tensor # Compute output filename if cfg.output_filename is None: @@ -499,7 +532,10 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: if cfg.use_rttm: vad_tag = Path(manifest_filepath).stem vad_tag = vad_tag[len("temp_manifest_vad_rttm_") :] - tag += f"-mask{cfg.feat_mask_val}-{vad_tag}" + if cfg.rttm_mode == "mask": + tag += f"-mask{cfg.feat_mask_val}-{vad_tag}" + else: + tag += f"-dropframe-{vad_tag}" cfg.output_filename = cfg.manifest_filepath.replace('.json', f'-{Path(cfg.asr_model).stem}-{tag}.json') cfg.output_filename = 
Path(cfg.output_dir) / Path(cfg.output_filename).name @@ -509,10 +545,12 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: "normalize": cfg.normalize, "normalize_type": cfg.normalize_type, "use_rttm": cfg.use_rttm, + "rttm_mode": cfg.rttm_mode, "feat_mask_val": cfg.feat_mask_val, "frame_unit_time_secs": cfg.frame_unit_time_secs, } - logging.info(f"use_rttm = {cfg.use_rttm}") + logging.info(f"use_rttm = {cfg.use_rttm}, rttm_mode = {cfg.rttm_mode}, feat_mask_val = {cfg.feat_mask_val}") + if hasattr(asr_model, "tokenizer"): dataset = feature_to_text_dataset.get_bpe_dataset(config=data_config, tokenizer=asr_model.tokenizer) else: @@ -542,10 +580,13 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: processed_signal=test_batch[0].to(device), processed_signal_length=test_batch[1].to(device), ) + with record_fn("asr_infer_other"): logits, logits_len = outputs[0], outputs[1] current_hypotheses, all_hyp = decode_function(logits, logits_len, return_hypotheses=False,) + if isinstance(current_hypotheses, tuple) and len(current_hypotheses) == 2: + current_hypotheses = current_hypotheses[0] # handle RNNT output hypotheses += current_hypotheses if all_hyp is not None: @@ -562,9 +603,16 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: # Save output to manifest input_manifest_data = read_manifest(manifest_filepath) manifest_data = read_manifest(cfg.manifest_filepath) + + if "text" not in manifest_data[0]: + has_groundtruth = False + else: + has_groundtruth = True + groundtruth = [] for i in range(len(manifest_data)): - groundtruth.append(manifest_data[i]["text"]) + if has_groundtruth: + groundtruth.append(manifest_data[i]["text"]) manifest_data[i]["pred_text"] = hypotheses[i] manifest_data[i]["feature_file"] = input_manifest_data[i]["feature_file"] if "rttm_file" in input_manifest_data[i]: @@ -572,19 +620,19 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: write_manifest(cfg.output_filename, manifest_data) - 
if cfg.use_pure_noise: + if not has_groundtruth: hypotheses = " ".join(hypotheses) words = hypotheses.split() chars = "".join(words) logging.info("-----------------------------------------") - logging.info(f"Number of hallucinated characters={len(chars)}") - logging.info(f"Number of hallucinated words={len(words)}") - logging.info(f"Concatenated predictions: {hypotheses}") + logging.info(f"Number of generated characters={len(chars)}") + logging.info(f"Number of generated words={len(words)}") logging.info("-----------------------------------------") else: wer_score = word_error_rate(hypotheses=hypotheses, references=groundtruth) + cer_score = word_error_rate(hypotheses=hypotheses, references=groundtruth, use_cer=True) logging.info("-----------------------------------------") - logging.info(f"WER={wer_score*100:.2f}") + logging.info(f"WER={wer_score:.4f}, CER={cer_score:.4f}") logging.info("-----------------------------------------") logging.info(f"ASR output saved at {cfg.output_filename}") diff --git a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml index 8c9ef7fffaf5..842c04777c72 100644 --- a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml +++ b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml @@ -15,7 +15,7 @@ vad: parameters: # Parameters not tuned on large datasets, please use default parameters with caution normalize_audio_db: null # set to non null value to normalize RMS DB of audio before preprocessing window_length_in_sec: 0.0 # window length in sec for VAD context input, must be 0 for frame-VAD - shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD + shift_length_in_sec: 0.02 # frame-length in seconds for frame-VAD, must be 0.02 for the pretrained NeMo VAD model smoothing: False # Deprecated for Frame-VAD. false or type of smoothing method (eg: median, mean) overlap: 0.875 # Deprecated for Frame-VAD. overlap ratio for overlapped mean/median smoothing filter. 
If smoothing=False, ignore this value. postprocessing: diff --git a/examples/asr/speech_classification/frame_vad_infer.py b/examples/asr/speech_classification/frame_vad_infer.py index 9c8e57b0773d..56eb7584e3db 100644 --- a/examples/asr/speech_classification/frame_vad_infer.py +++ b/examples/asr/speech_classification/frame_vad_infer.py @@ -21,7 +21,11 @@ ## Usage: python frame_vad_infer.py \ --config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \ - dataset= + dataset= + +The manifest json file should have the following format (each line is a Python dictionary): +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000} """ import os diff --git a/examples/asr/speech_classification/speech_to_frame_label.py b/examples/asr/speech_classification/speech_to_frame_label.py index 04cc77afda44..3289845ec3d3 100644 --- a/examples/asr/speech_classification/speech_to_frame_label.py +++ b/examples/asr/speech_classification/speech_to_frame_label.py @@ -28,6 +28,15 @@ strategy="ddp" \ trainer.max_epochs=200 ``` + +The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "label"] are required. An example of a manifest file is: +``` +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "label": "0 1 0 0 1"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "text": "0 0 0 1 1 1 1 0 0"} +``` +For example, if you have a 1s audio file, you'll need to have 50 frame labels in the manifest entry like "0 0 0 0 1 1 0 1 .... 0 1". +However, shorter label strings are also supported for smaller file sizes. For example, you can prepare the `label` in 40ms frame, and the model will properly repeat the label for each 20ms frame. 
+ """ import pytorch_lightning as pl diff --git a/nemo/collections/asr/data/feature_to_label.py b/nemo/collections/asr/data/feature_to_label.py index 673f50374581..058d0157fcbd 100644 --- a/nemo/collections/asr/data/feature_to_label.py +++ b/nemo/collections/asr/data/feature_to_label.py @@ -262,14 +262,20 @@ class FeatureToLabelDataset(Dataset): Dataset that loads tensors via a json file containing paths to feature files and their labels. Each new line is a different sample. Example below: and their target labels. JSON files should be of the following format: - {"feature_filepath": "/path/to/audio_feature.pt", "label": "1"} \ + {"feature_filepath": "/path/to/audio_feature.pt", "label": "1"} ... {"feature_filepath": "/path/to/audio_feature.pt", "label": "0"} Args: - manifest_filepath (str): Dataset parameter. Path to JSON containing data. - labels (Optional[list]): Dataset parameter. List of unique labels collected from all samples. + manifest_filepath (str): Path to JSON containing data. + labels (Optional[list]): List of unique labels collected from all samples. augmentor (Optional): feature augmentation - + window_length_in_sec (float): Window length in seconds. + shift_length_in_sec (float): Shift length in seconds. + is_regression_task (bool): if True, the labels are treated as for a regression task. + cal_labels_occurrence (bool): if True, the labels occurrence will be calculated. + zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram. + min_duration (float): Minimum duration of the audio file in seconds. + max_duration (float): Maximum duration of the audio file in seconds. 
""" ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal @@ -296,22 +302,53 @@ def __init__( augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, window_length_in_sec: float = 0.63, shift_length_in_sec: float = 0.01, + is_regression_task: bool = False, + cal_labels_occurrence: Optional[bool] = False, + zero_spec_db_val: float = -16.635, + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, ): super().__init__() self.window_length_in_sec = window_length_in_sec self.shift_length_in_sec = shift_length_in_sec - self.collection = collections.ASRFeatureLabel(manifests_files=manifest_filepath.split(','),) + self.zero_spec_db_val = zero_spec_db_val + + if isinstance(manifest_filepath, str): + manifest_filepath = manifest_filepath.split(',') + + self.collection = collections.ASRFeatureLabel( + manifests_files=manifest_filepath, + is_regression_task=is_regression_task, + cal_labels_occurrence=cal_labels_occurrence, + min_duration=min_duration, + max_duration=max_duration, + ) self.feature_loader = ExternalFeatureLoader(augmentor=augmentor) self.labels = labels if labels else self.collection.uniq_labels - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label + self.is_regression_task = is_regression_task - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) + if not is_regression_task: + self.labels = labels if labels else self.collection.uniq_labels + self.num_classes = len(self.labels) if self.labels is not None else 1 + self.label2id, self.id2label = {}, {} + self.id2occurrence, self.labels_occurrence = {}, [] + + for label_id, label in enumerate(self.labels): + self.label2id[label] = label_id + self.id2label[label_id] = label + if cal_labels_occurrence: + self.id2occurrence[label_id] = self.collection.labels_occurrence[label] + + if 
cal_labels_occurrence: + self.labels_occurrence = [self.id2occurrence[k] for k in sorted(self.id2occurrence)] + + for idx in range(len(self.labels[:5])): + logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) + else: + self.labels = [] + self.num_classes = 1 def __len__(self): return len(self.collection) @@ -328,9 +365,133 @@ def __getitem__(self, index): return f, fl, t, tl def _collate_fn(self, batch): - return _audio_feature_collate_fn(batch, self.ZERO_LEVEL_SPEC_DB_VAL, 0) + return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0) def _vad_segment_collate_fn(self, batch): return _vad_feature_segment_collate_fn( batch, self.window_length_in_sec, self.shift_length_in_sec, self.FRAME_UNIT_TIME_SECS ) + + +class FeatureToMultiLabelDataset(Dataset): + """ + Dataset that loads tensors via a json file containing paths to feature files and their labels. + Each new line is a different sample. Example below: + and their target labels. JSON files should be of the following format: + {"feature_filepath": "/path/to/audio_feature.pt", "label": "1 1 0 0 1"} + ... + {"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"} + Args: + manifest_filepath (str): Path to JSON containing data. + labels (Optional[list]): List of unique labels collected from all samples. + augmentor (Optional): feature augmentation + delimiter (str): delimiter to split the labels. + is_regression_task (bool): if True, the labels are treated as for a regression task. + cal_labels_occurrence (bool): if True, the labels occurrence will be calculated. + zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram. + min_duration (float): Minimum duration of the audio file in seconds. + max_duration (float): Maximum duration of the audio file in seconds. 
+ """ + + ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """Returns definitions of module output ports. + """ + output_types = { + 'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + 'feat_length': NeuralType(tuple('B'), LengthsType()), + 'labels': NeuralType(('B', 'T'), LabelsType()), + 'labels_length': NeuralType(tuple('B'), LengthsType()), + } + + return output_types + + def __init__( + self, + *, + manifest_filepath: str, + labels: List[str] = None, + augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, + delimiter: Optional[str] = None, + is_regression_task: bool = False, + cal_labels_occurrence: Optional[bool] = False, + zero_spec_db_val: float = -16.635, + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, + ): + super().__init__() + self.delimiter = delimiter + self.zero_spec_db_val = zero_spec_db_val + + if isinstance(manifest_filepath, str): + manifest_filepath = manifest_filepath.split(',') + + self.collection = collections.ASRFeatureLabel( + manifests_files=manifest_filepath, + is_regression_task=is_regression_task, + cal_labels_occurrence=cal_labels_occurrence, + delimiter=delimiter, + min_duration=min_duration, + max_duration=max_duration, + ) + + self.is_regression_task = is_regression_task + self.feature_loader = ExternalFeatureLoader(augmentor=augmentor) + self.labels = labels if labels else self.collection.uniq_labels + + self.label2id, self.id2label = {}, {} + if not is_regression_task: + self.labels = labels if labels else self._get_label_set() + self.num_classes = len(self.labels) if self.labels is not None else 1 + self.label2id, self.id2label = {}, {} + for label_id, label in enumerate(self.labels): + self.label2id[label] = label_id + self.id2label[label_id] = label + if cal_labels_occurrence: + self.id2occurrence[label_id] = self.collection.labels_occurrence[label] + 
self.labels_occurrence.append(self.id2occurrence[label_id]) + + for idx in range(len(self.labels[:5])): + logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) + else: + self.labels = [] + self.num_classes = 1 + + def _get_label_set(self): + labels = [] + for sample in self.collection: + label_str = sample.label + if label_str: + label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split() + labels.extend(label_str_list) + return sorted(set(labels)) + + def _label_str_to_tensor(self, label_str: str): + labels = label_str.split(self.delimiter) if self.delimiter else label_str.split() + + if self.is_regression_task: + labels = [float(s) for s in labels] + labels = torch.tensor(labels).float() + else: + labels = [self.label2id[s] for s in labels] + labels = torch.tensor(labels).long() + return labels + + def __len__(self): + return len(self.collection) + + def __getitem__(self, index): + sample = self.collection[index] + + features = self.feature_loader.process(sample.feature_file) + f, fl = features, torch.tensor(features.shape[1]).long() + + t = self._label_str_to_tensor(sample.label) + tl = torch.tensor(t.size(0)).long() + + return f, fl, t, tl + + def _collate_fn(self, batch): + return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0) diff --git a/nemo/collections/asr/data/feature_to_label_dataset.py b/nemo/collections/asr/data/feature_to_label_dataset.py index dabe06aa62bb..08803f43ce8d 100644 --- a/nemo/collections/asr/data/feature_to_label_dataset.py +++ b/nemo/collections/asr/data/feature_to_label_dataset.py @@ -34,13 +34,35 @@ def get_feature_seq_speakerlabel_dataset( def get_feature_label_dataset( - config: dict, augmentor: Optional['AudioAugmentor'] = None + config: dict, augmentor: Optional['FeatureAugmentor'] = None ) -> feature_to_label.FeatureToLabelDataset: dataset = feature_to_label.FeatureToLabelDataset( manifest_filepath=config['manifest_filepath'], labels=config['labels'], 
augmentor=augmentor, window_length_in_sec=config.get("window_length_in_sec", 0.63), - shift_length_in_sec=config.get("shift_length_in_sec", 0.01), + shift_length_in_sec=config.get("shift_length_in_sec", 0.08), + is_regression_task=config.get("is_regression_task", False), + cal_labels_occurrence=config.get("cal_labels_occurrence", False), + zero_spec_db_val=config.get("zero_spec_db_val", -16.635), + max_duration=config.get('max_duration', None), + min_duration=config.get('min_duration', None), + ) + return dataset + + +def get_feature_multi_label_dataset( + config: dict, augmentor: Optional['FeatureAugmentor'] = None +) -> feature_to_label.FeatureToMultiLabelDataset: + dataset = feature_to_label.FeatureToMultiLabelDataset( + manifest_filepath=config['manifest_filepath'], + labels=config['labels'], + augmentor=augmentor, + delimiter=config.get('delimiter', None), + is_regression_task=config.get("is_regression_task", False), + cal_labels_occurrence=config.get("cal_labels_occurrence", False), + zero_spec_db_val=config.get("zero_spec_db_val", -16.635), + max_duration=config.get('max_duration', None), + min_duration=config.get('min_duration', None), ) return dataset diff --git a/nemo/collections/asr/data/feature_to_text.py b/nemo/collections/asr/data/feature_to_text.py index eaec7b3afba5..a7e295051ae8 100644 --- a/nemo/collections/asr/data/feature_to_text.py +++ b/nemo/collections/asr/data/feature_to_text.py @@ -86,30 +86,32 @@ class _FeatureTextDataset(Dataset): {"feature_filepath": "/path/to/audio_feature.pt", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": "utterance_id", "ctm_utt": "en_4156", "side": "A"} Args: - manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. + manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths. parser: Str for a language specific preprocessor or a callable. 
- normalize: whether and where to normalize feature, must be one of [None, "post_norm", "pre_norm"] + normalize (str): whether and where to normalize feature, must be one of [None, "post_norm", "pre_norm"] normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm: whether to use RTTM files if there is any, default to False + use_rttm (bool): whether to use RTTM files if there is any, default to False + rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' + feat_min_len (int): minimum length of feature when rttm_mode=drop, default to 4. feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram frame_unit_time_secs (float): time in seconds for each frame sample_rate (int): Sample rate to resample loaded audio to int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded - audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include in dataset - max_utts: Limit number of utterances - trim: whether or not to trim silence. Defaults to False - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - pad_id: Id of pad symbol. Defaults to 0 + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio + max_duration (float): If audio exceeds this length, do not include in dataset + min_duration (float): If audio is less than this length, do not include in dataset + max_utts (int): Limit number of utterances + trim (bool): whether or not to trim silence.
Defaults to False + bos_id (int): Id of beginning of sequence symbol to append if not None + eos_id (int): Id of end of sequence symbol to append if not None + pad_id (int): Id of pad symbol. Defaults to 0 return_sample_id (bool): whether to return the sample_id as a part of each sample channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. """ ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal NORM_MODES = ["pre_norm", "post_norm"] + RTTM_MODES = ["mask", "drop"] @property def output_types(self) -> Optional[Dict[str, NeuralType]]: @@ -130,6 +132,8 @@ def __init__( normalize: Optional[str] = "post_norm", normalize_type: Union[str, dict] = "per_feature", use_rttm: bool = False, + rttm_mode: str = "mask", + feat_min_len: int = 4, feat_mask_val: Optional[float] = None, frame_unit_time_secs: float = 0.01, sample_rate: Optional[int] = 16000, @@ -151,6 +155,11 @@ def __init__( self.normalize = normalize self.normalize_type = normalize_type self.use_rttm = use_rttm + self.rttm_mode = rttm_mode + if self.use_rttm and self.rttm_mode not in self.RTTM_MODES: + raise ValueError(f"`rttm_mode` must be one of {self.RTTM_MODES}, got `{rttm_mode}` instead") + + self.feat_min_len = feat_min_len if feat_mask_val is not None: self.feat_mask_val = feat_mask_val elif normalize == "pre_norm": @@ -197,17 +206,18 @@ def __getitem__(self, index): # Feature normalization if self.normalize is None: if self.use_rttm and sample.rttm_file: - f = self.mask_features_from_rttm(f, offset, sample.rttm_file, self.feat_mask_val) + f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) elif self.normalize == "post_norm": # (Optional) Masking based on RTTM file if self.use_rttm and sample.rttm_file: - f = self.mask_features_from_rttm(f, offset, 
sample.rttm_file, self.feat_mask_val) + f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) + f = self.normalize_feature(f) else: # pre-norm f = self.normalize_feature(f) # (Optional) Masking based on RTTM file if self.use_rttm and sample.rttm_file: - f = self.mask_features_from_rttm(f, offset, sample.rttm_file, self.feat_mask_val) + f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) if self.return_sample_id: output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index @@ -216,17 +226,32 @@ def __getitem__(self, index): return output - def mask_features_from_rttm(self, features, offset, rttm_file, mask_val): + def process_features_with_rttm(self, features, offset, rttm_file, mask_val): segments = load_speech_segments_from_rttm(rttm_file) - sid = 0 + new_features = features.clone() + sid, fid = 0, 0 for i in range(features.size(1)): t = offset + i * self.frame_unit_time_secs while sid < len(segments) - 1 and segments[sid][1] < t: sid += 1 if segments[sid][1] == 0 or t < segments[sid][0] or t > segments[sid][1]: - features[:, i] = mask_val - - return features + # not in speech segment + if self.rttm_mode == "drop": + # drop the frame + continue + else: + # mask the frame with specified value + new_features[:, i] = mask_val + fid += 1 + else: + # in speech segment + new_features[:, fid] = features[:, i] + fid += 1 + + if fid < self.feat_min_len and self.rttm_mode == "drop": + new_features[:, : self.feat_min_len] = mask_val + return new_features[:, : self.feat_min_len] + return new_features[:, :fid] def __len__(self): return len(self.manifest_processor.collection) @@ -259,12 +284,14 @@ class FeatureToCharDataset(_FeatureTextDataset): "utterance_id", "ctm_utt": "en_4156", "side": "A"} Args: - manifest_filepath: Path to manifest json as described above. Can + manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths. 
- labels: String containing all the possible characters to map to - normalize: how to normalize feature, must be one of [None, "post_norm", "pre_norm"] + labels (str): String containing all the possible characters to map to + normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"] normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm: whether to use RTTM files if there is any, default to False + use_rttm (bool): whether to use RTTM files if there is any, default to False + rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' + feat_min_len (int): minimum length of feature, default to 4 feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram frame_unit_time_secs: time in seconds for each frame sample_rate (int): Sample rate to resample loaded audio to @@ -290,6 +317,8 @@ def __init__( normalize: Optional[str] = "post_norm", normalize_type: Union[str, dict] = "per_feature", use_rttm: bool = False, + rttm_mode: str = "mask", + feat_min_len: int = 4, feat_mask_val: Optional[float] = None, frame_unit_time_secs: float = 0.01, sample_rate: Optional[int] = 16000, @@ -319,6 +348,8 @@ def __init__( normalize=normalize, normalize_type=normalize_type, use_rttm=use_rttm, + rttm_mode=rttm_mode, + feat_min_len=feat_min_len, feat_mask_val=feat_mask_val, frame_unit_time_secs=frame_unit_time_secs, sample_rate=sample_rate, @@ -352,14 +383,16 @@ class FeatureToBPEDataset(_FeatureTextDataset): the manifest. Args: - manifest_filepath: Path to manifest json as described above. Can + manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths. tokenizer: A subclass of the Tokenizer wrapper found in the common collection, nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of all available tokenizers. 
- normalize: how to normalize feature, must be one of [None, "post_norm", "pre_norm"] + normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"] normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm: whether to use RTTM files if there is any, default to False + use_rttm (bool): whether to use RTTM files if there is any, default to False + rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' + feat_min_len (int): minimum length of feature, default to 4 feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram frame_unit_time_secs: time in seconds for each frame sample_rate (int): Sample rate to resample loaded audio to @@ -384,6 +417,8 @@ def __init__( normalize: Optional[str] = "post_norm", normalize_type: Union[str, dict] = "per_feature", use_rttm: bool = False, + rttm_mode: str = "mask", + feat_min_len: int = 4, feat_mask_val: Optional[float] = None, frame_unit_time_secs: float = 0.01, sample_rate: Optional[int] = 16000, @@ -435,6 +470,8 @@ def __call__(self, *args): normalize=normalize, normalize_type=normalize_type, use_rttm=use_rttm, + rttm_mode=rttm_mode, + feat_min_len=feat_min_len, feat_mask_val=feat_mask_val, frame_unit_time_secs=frame_unit_time_secs, sample_rate=sample_rate, diff --git a/nemo/collections/asr/data/feature_to_text_dataset.py b/nemo/collections/asr/data/feature_to_text_dataset.py index 7efd3be3cd24..6bc03bc0b33d 100644 --- a/nemo/collections/asr/data/feature_to_text_dataset.py +++ b/nemo/collections/asr/data/feature_to_text_dataset.py @@ -38,6 +38,8 @@ def get_char_dataset(config: dict, augmentor: Optional['FeatureAugmentor'] = Non normalize=config.get('normalize', 'post_norm'), normalize_type=config.get('normalize_type', 'per_feature'), use_rttm=config.get('use_rttm', False), + rttm_mode=config.get('rttm_mode', 
'mask'), + feat_min_len=config.get('feat_min_len', 4), feat_mask_val=config.get('feat_mask_val', None), frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01), sample_rate=config.get('sample_rate', 16000), @@ -75,6 +77,8 @@ def get_bpe_dataset( normalize=config.get('normalize', 'post_norm'), normalize_type=config.get('normalize_type', 'per_feature'), use_rttm=config.get('use_rttm', False), + rttm_mode=config.get('rttm_mode', 'mask'), + feat_min_len=config.get('feat_min_len', 4), feat_mask_val=config.get('feat_mask_val', None), frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01), sample_rate=config.get('sample_rate', 16000), diff --git a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb index b38fab2c98bf..1445afe9e381 100644 --- a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb +++ b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb @@ -15,7 +15,9 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -364,6 +366,16 @@ "metric_value = word_error_rate(hypotheses=predicted_text, references=ground_truth_text, use_cer=False)\n", "print(f\"WER is {metric_value}\")" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Further Reading\n", + "\n", + "There are two ways to incorporate VAD into ASR pipeline. 
The first strategy is to drop the frames that are predicted as `non-speech` by VAD, as already discussed in this tutorial. The second strategy is to keep all the frames and mask the `non-speech` frames with zero-signal values. Also, instead of using segment-VAD as shown in this tutorial, we can use a frame-VAD model for faster inference and better accuracy. For more information, please refer to the script [speech_to_text_with_vad.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr_vad/speech_to_text_with_vad.py)." + ] } ], "metadata": { diff --git a/tutorials/asr/Voice_Activity_Detection.ipynb b/tutorials/asr/Voice_Activity_Detection.ipynb index b8013822c486..8b95698c71e8 100644 --- a/tutorials/asr/Voice_Activity_Detection.ipynb +++ b/tutorials/asr/Voice_Activity_Detection.ipynb @@ -18,7 +18,9 @@ "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4.
Run this cell to set up dependencies.\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "\n", @@ -1124,6 +1126,25 @@ "# Inference and more\n", "If you are interested in **pretrained** model and **streaming inference**, please have a look at our [VAD inference tutorial](https://github.com/NVIDIA/NeMo/blob/stable/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb) and script [vad_infer.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/vad_infer.py)\n" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Frame-VAD: More Effective and Efficient VAD for More Fine-grained Timestamps\n", + "\n", + "In this notebook, we are using the segment-VAD model, which predicts a single label for each short segment (0.63s), which is not optimal for some applications that require very precise timestamps. \n", + "\n", + "To get more precise timestamps, we can use a frame-VAD model, which predicts a label for each input frame (20ms). To prepare manifest for frame-VAD, you'll need to have `label` field in each manifest entry, which is a string of labels for each frame. For example, if you have a 1s audio file, you'll need to have 50 frame labels in the manifest entry like \"0 0 0 0 1 1 0 1 .... 0 1\".\n", + "However, shorter label strings are also supported for smaller file sizes. For example, you can prepare the `label` in 40ms frame, and the model will properly repeat the label for each 20ms frame. 
\n", + "\n", + "The Frame-VAD model shares the same MarbleNet architecture as the segment-VAD model, but with a different input/output resolution and loss function. The frame-VAD model is trained with more data than segment-VAD and achieves better performance, as shown in the [NGC model card](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_frame_marblenet). \n", + "\n", + "During inference, since frame-VAD model doesn't require splicing input into overlapping segments, it is more efficient than segment-VAD model, with 8x less GPU memory consumption.\n", + "\n", + "For more information on the frame-VAD model, please refer to the [model class](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/classification_models.py#L840). For training and running inference on frame-VAD, please refer to [speech_to_frame_label.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/speech_to_frame_label.py) and [frame_vad_infer.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/frame_vad_infer.py)." 
+ ] } ], "metadata": { From 1e4845c057c5176d19bbece698eeee65268ef8d8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:18:01 -0600 Subject: [PATCH 040/123] Add API docs for NeMo Megatron (#6850) (#6864) * add model pretraining and customization classes * fix * test width * increase middle pane width * add modules and datasets * remove global in t5 dataset s and fix formatting in megatron base model --------- Signed-off-by: ericharper Co-authored-by: Eric Harper --- docs/source/_static/css/custom.css | 2 +- docs/source/conf.py | 5 +- docs/source/nlp/api.rst | 193 +++++++++++------- .../language_modeling/megatron_base_model.py | 25 +-- 4 files changed, 135 insertions(+), 90 deletions(-) diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index da134a02d86a..cf0ad0ff2d7f 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -255,7 +255,7 @@ article ul { } } -@media (min-width: 1400px) { +@media (min-width: none) { body { font-size: 18px; } diff --git a/docs/source/conf.py b/docs/source/conf.py index a78ba3528048..0765f8940ab0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,6 @@ sys.path.insert(0, os.path.abspath("../..")) sys.path.insert(0, os.path.abspath("../../nemo")) -sys.path.insert(0, os.path.abspath("../../nemo_text_processing")) from package_info import __version__ @@ -47,7 +46,6 @@ 'hydra', # hydra-core in requirements, hydra during import 'dateutil', # part of core python 'transformers.tokenization_bert', # has ., troublesome for this regex - 'megatron', # megatron-lm in requirements, megatron in import 'sklearn', # scikit_learn in requirements, sklearn in import 'nemo_text_processing.inverse_text_normalization', # Not installed automatically 'nemo_text_processing.text_normalization', # Not installed automatically @@ -55,10 +53,13 @@ 'torchmetrics', # inherited from PTL 
'lightning_utilities', # inherited from PTL 'apex', + 'megatron.core', + 'transformer_engine', 'joblib', # inherited from optional code 'IPython', 'ipadic', 'psutil', + 'regex', ] _skipped_autodoc_mock_imports = ['wrapt', 'numpy'] diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index 46efb0851d4e..7c6971a68d05 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -1,99 +1,142 @@ -NeMo NLP collection API +NeMo Megatron API ======================= -Model Classes -------------- +Pretraining Model Classes +------------------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_base_model.MegatronBaseModel + :show-inheritance: + :no-members: + :members: __init__, configure_optimizers + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bert_model.MegatronBertModel + :show-inheritance: + :no-members: + :members: training_step, validation_step, build_train_valid_test_datasets, build_LDDL_data, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bart_model.MegatronBARTModel + :show-inheritance: + :no-members: + :members: training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_retrieval_model.MegatronRetrievalModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. 
autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model + :show-inheritance: + :no-members: + :members: complete, encode, decode, add_special_tokens_to_tokenizer, training_step, validation_step, build_train_valid_test_datasets, setup + +Customization Model Classes +--------------------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model.MegatronGPTSFTModel + :show-inheritance: + :no-members: + :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTAdapterLearningModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTInfusedAdapterModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model.MegatronGPTPromptLearningModel + :show-inheritance: + :no-members: + :members: built_virtual_prompt_dataset, generate, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel + :show-inheritance: + :no-members: + :members: __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel + :show-inheritance: + :no-members: + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + +.. 
autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5InfusedAdapterModel + :show-inheritance: + :no-members: + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup -.. autoclass:: nemo.collections.nlp.models.TextClassificationModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact, classifytext +Modules +------- -.. autoclass:: nemo.collections.nlp.models.GLUEModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact +.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.MegatronModule + :show-inheritance: -.. autoclass:: nemo.collections.nlp.models.PunctuationCapitalizationModel - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.Float16Module + :show-inheritance: -.. autoclass:: nemo.collections.nlp.models.TokenClassificationModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - -.. autoclass:: nemo.collections.nlp.models.QAModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end -.. autoclass:: nemo.collections.nlp.models.DuplexTaggerModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.gpt_model.GPTModel + :show-inheritance: + :no-members: + :members: forward -.. 
autoclass:: nemo.collections.nlp.models.DuplexDecoderModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, inference, validation_epoch_end, test_epoch_end +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert_model.BertModel + :show-inheritance: + :no-members: + :members: forward -.. autoclass:: nemo.collections.nlp.models.BERTLMModel - :show-inheritance: - :members: setup_training_data, setup_optimization +.. autoclass:: nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder.MegatronTokenLevelEncoderDecoderModule + :show-inheritance: + :no-members: + :members: forward -Modules -------- +.. autoclass:: nemo.collections.nlp.modules.common.megatron.retrieval_token_level_encoder_decoder.MegatronRetrievalTokenLevelEncoderDecoderModule + :show-inheritance: + :no-members: + :members: forward -.. autoclass:: nemo.collections.nlp.modules.BertModule - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.nlp.modules.AlbertEncoder - :show-inheritance: - :members: -.. autoclass:: nemo.collections.nlp.modules.BertEncoder - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.nlp.modules.DistilBertEncoder - :show-inheritance: - :members: +Datasets +-------- -.. autoclass:: nemo.collections.nlp.modules.RobertaEncoder - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset.BlendableDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.modules.SequenceClassifier - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.GPTDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.modules.SequenceRegression - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.MockGPTDataset + :show-inheritance: -.. 
autoclass:: nemo.collections.nlp.modules.SequenceTokenClassifier - :show-inheritance: - :members: +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.bert_dataset.BertDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.get_lm_model +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.base_prompt_learning_dataset.BasePromptLearningDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.get_pretrained_lm_models_list +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset + :show-inheritance: -.. autofunction:: nemo.collections.nlp.modules.common.megatron.get_megatron_lm_models_list +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTChatDataset + :show-inheritance: -Datasets --------- +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.retro_dataset.RETRODataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_dataset.BertPunctuationCapitalizationDataset - :show-inheritance: - :members: - :special-members: __getitem__ +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_dataset.T5Dataset + :show-inheritance: + :exclude-members: MAX_SEQ_LENGTH_DELTA -.. autofunction:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.create_tarred_dataset +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_prompt_learning_dataset.T5PromptLearningDataset + :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset.BertPunctuationCapitalizationTarredDataset - :show-inheritance: - :members: - :special-members: __iter__ - :exclude-members: reinforce_type +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset.UL2Dataset + :show-inheritance: -.. 
autoclass:: nemo.collections.nlp.data.token_classification.punctuation_capitalization_infer_dataset.BertPunctuationCapitalizationInferDataset - :show-inheritance: - :members: - :special-members: __getitem__ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7be679376175..ceddc1dca4d4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -61,18 +61,19 @@ class MegatronBaseModel(NLPModel): """ - Megatron base class - It does the following things: - 1. Initialize the model parallel for nemo given the model parallel parameters. - 2. Turn on all the nvidia optimizations. - 3. If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the correct size for tensor model parallelism. - 4. If using distributed optimizer, configure to be compatible with - O2-level optimizations and/or model parallelism. - 5. Perform gradient clipping: `grad_clip_pl_default` triggers the - PyTorch Lightning default implementation, `with_distributed_adam` - triggers the distributed optimizer's implementation, - `megatron_amp_o2` triggers gradient clipping on the main grads, - and otherwise gradient clipping is performed on the model grads. + Megatron base class. All NeMo Megatron models inherit from this class. + + - Initialize the model parallel world for nemo. + - Turn on all of the nvidia optimizations. + - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the + correct size for tensor model parallelism. + - If using distributed optimizer, configure to be compatible + with O2 level optimizations and/or model parallelism. 
+ - Perform gradient clipping: `grad_clip_pl_default` triggers + the PyTorch Lightning default implementation, `with_distributed_adam` triggers + the distributed optimizer's implementation, `megatron_amp_o2` triggers gradient clipping on the main grads, + and otherwise gradient clipping is performed on the model grads. + """ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): From 72132a200d915e88f461bbcb5db1e2bd54d8ed93 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 15 Jun 2023 01:57:56 -0400 Subject: [PATCH 041/123] Update transcribe_utils.py (#6865) fix ctc decoding for hybrid model in partial transcribe Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- nemo/collections/asr/parts/utils/transcribe_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 60f936306d05..7cf957a7cec0 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -388,7 +388,7 @@ def transcribe_partial_audio( decode_function = ( asr_model.decoding.rnnt_decoder_predictions_tensor if decoder_type == 'rnnt' - else asr_model.decoding.ctc_decoder_predictions_tensor + else asr_model.ctc_decoding.ctc_decoder_predictions_tensor ) elif hasattr(asr_model, 'joint'): # RNNT model decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor From a6c8cce5573ece94f96c444900e53edf5e7b59d7 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Sat, 17 Jun 2023 09:05:51 -0600 Subject: [PATCH 042/123] Import Enum for chatbot component (#6877) * import Enum Signed-off-by: ericharper * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * make web server import conditional Signed-off-by: ericharper --------- Signed-off-by: ericharper 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- examples/nlp/language_modeling/megatron_gpt_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index b33cdefc6df2..af1657b44d7b 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -23,7 +23,6 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer from nemo.collections.nlp.modules.common.text_generation_utils import generate from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam @@ -297,6 +296,8 @@ def main(cfg) -> None: # Third method of running text generation, use inference server if cfg.server: + from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo + if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: if cfg.web_server: if cfg.chat: From 6f2035bf985053bdda269f4df3b9338f5b58bfd5 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Sat, 17 Jun 2023 16:39:50 -0700 Subject: [PATCH 043/123] [bugfix] avoid the random shuffle of phoneme and tone tokens. 
(#6855) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- nemo/collections/tts/g2p/models/zh_cn_pinyin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py index 35a22f6ba118..aab57c925c82 100644 --- a/nemo/collections/tts/g2p/models/zh_cn_pinyin.py +++ b/nemo/collections/tts/g2p/models/zh_cn_pinyin.py @@ -82,11 +82,11 @@ def __init__( if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path) else phoneme_dict ) - self.phoneme_list = list({pron for prons in phoneme_dict.values() for pron in prons}) + self.phoneme_list = sorted({pron for prons in phoneme_dict.values() for pron in prons}) # tones self.tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)} - self.tone_list = list(self.tone_dict.values()) + self.tone_list = sorted(self.tone_dict.values()) # ascii letters self.ascii_letter_dict = { From 990c764d133adc99a34dac1469c4e4ed35d03813 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 18 Jun 2023 15:49:03 -0600 Subject: [PATCH 044/123] update mcore version (#6875) (#6876) Signed-off-by: ericharper Co-authored-by: Eric Harper --- README.rst | 2 +- requirements/requirements_nlp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 863b279b2be8..6742eb1f07d4 100644 --- a/README.rst +++ b/README.rst @@ -263,7 +263,7 @@ packaging is also needed: .. 
code-block:: bash - pip install -y packaging + pip install packaging Transformer Engine diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 1ff4c444c2bf..582862361a22 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -12,7 +12,7 @@ inflect jieba markdown2 matplotlib>=3.3.2 -megatron_core==0.1.0 +megatron_core==0.2.0 nltk>=3.6.5 opencc pangu From 3aac7958ebc0936296d6fa8328d2d7f51793187c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 18 Jun 2023 15:50:41 -0600 Subject: [PATCH 045/123] Add trainer.validate example for GPT (#6794) (#6822) * add trainer.validate example * clean up white space * add mbs and gbs to the config --------- Signed-off-by: ericharper Co-authored-by: Eric Harper --- .../conf/megatron_gpt_validate_config.yaml | 22 +++ .../megatron_gpt_validate.py | 155 ++++++++++++++++++ .../language_modeling/megatron_gpt_model.py | 21 +-- 3 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml create mode 100644 examples/nlp/language_modeling/megatron_gpt_validate.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml new file mode 100644 index 000000000000..39b0c7ed2176 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml @@ -0,0 +1,22 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + log_every_n_steps: 1 + limit_val_batches: 10 + limit_test_batches: 50 + max_steps: 100 # needed to setup dataloaders + max_epochs: null + replace_sampler_ddp: False + +tensor_model_parallel_size: ??? # should be set the same as the pretrained model that is being restored from +pipeline_model_parallel_size: ??? 
# should be set the same as the pretrained model that is being restored from +micro_batch_size: null # limited by GPU memory, defaults to pretrained model config +global_batch_size: null # will use more micro batches to reach global batch size, defaults to pretrained model config +virtual_pipeline_model_parallel_size: null +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading diff --git a/examples/nlp/language_modeling/megatron_gpt_validate.py b/examples/nlp/language_modeling/megatron_gpt_validate.py new file mode 100644 index 000000000000..b5a61e627a14 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_validate.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile + +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import ( + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +""" Example script showing how to run validation on a MegatronGPT model. + + Sample usage: + + From nemo model: + + python megatron_gpt_validate.py \ + trainer.devices=4 \ + trainer.num_nodes=1 \ + trainer.limit_val_batches=10 \ + trainer.max_steps=100 \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=4 \ + trainer.precision=bf16 \ + gpt_model_file=/path/to/megatron_gpt_tp_1_pp4.nemo + + from PTL checkpoint: + python megatron_gpt_validate.py \ + trainer.devices=4 \ + trainer.num_nodes=1 \ + trainer.limit_val_batches=10 \ + trainer.max_steps=100 \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=4 \ + virtual_pipeline_model_parallel_size=4 \ + trainer.precision=bf16 \ + checkpoint_dir='/path/to/experiment/checkpoints' \ + checkpoint_name='megatron_gpt--val_loss=7.78-step=100-consumed_samples=6336.0-last.ckpt' \ + hparams_file='/path/to/experiment/hparams.yaml' + +""" + + +def modify_pretrained_cfg(pretrained_cfg, trainer, cfg): + with open_dict(pretrained_cfg): + OmegaConf.set_struct(pretrained_cfg, True) + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + if cfg.micro_batch_size is not None: +
pretrained_cfg.micro_batch_size = cfg.micro_batch_size + if cfg.global_batch_size is not None: + pretrained_cfg.global_batch_size = cfg.global_batch_size + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + return pretrained_cfg + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_validate_config") +def main(cfg) -> None: + + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + if cfg.gpt_model_file: + logging.info(f"Restoring model from {cfg.gpt_model_file}") + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + pretrained_cfg = modify_pretrained_cfg(pretrained_cfg, trainer, cfg) + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models + ) + elif cfg.checkpoint_dir: + logging.info( + f"Restoring model from checkpoint_dir: {cfg.checkpoint_dir} with checkpoint name: {cfg.checkpoint_name}" + ) + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = 
cfg.virtual_pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=cfg.virtual_pipeline_model_parallel_size, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + pretrained_cfg = OmegaConf.load(cfg.hparams_file) + pretrained_cfg = modify_pretrained_cfg(pretrained_cfg.cfg, trainer, cfg) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: + OmegaConf.save(config=pretrained_cfg, f=f.name) + model = MegatronGPTModel.load_from_checkpoint( + checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name, + ) + else: + raise ValueError("need at least a nemo file or checkpoint dir") + + logging.info("\n\n************** Model configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(model.cfg)}') + + trainer.validate(model=model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 853c637eb3b3..1ce153bcf0fb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1016,17 +1016,18 @@ def setup(self, stage=None): self.setup_validation_data(self.cfg.data) self.setup_test_data(self.cfg.data) - # when using pipeline model parallel the final stage need to initialize word embeddings - if 
parallel_state.get_pipeline_model_parallel_world_size() > 1: - if isinstance(self.model, list): - for i, module in enumerate(self.model): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) + if stage == 'fit': + # when using pipeline model parallel the final stage need to initialize word embeddings + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if isinstance(self.model, list): + for i, module in enumerate(self.model): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + if self.cfg.get('share_embeddings_and_output_weights', True): + module.sync_initial_word_embeddings() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + else: if self.cfg.get('share_embeddings_and_output_weights', True): - module.sync_initial_word_embeddings() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) - else: - if self.cfg.get('share_embeddings_and_output_weights', True): - self.model.sync_initial_word_embeddings() + self.model.sync_initial_word_embeddings() if self.cfg.get('transformer_engine', False): self.setup_transformer_engine_tp_groups() From fc8407112dc7fac30ddd2038f56c9c229a3fb3fe Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Sun, 18 Jun 2023 22:46:02 -0700 Subject: [PATCH 046/123] typo fix from #6666 (#6882) * typo fix Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/nlp/modules/common/text_generation_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 6417f887c0cd..d84d16efb5ba 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -105,7 +105,8 @@ def 
megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para greedy=sampling_params['use_greedy'], repetition_penalty=sampling_params['repetition_penalty'], min_tokens_to_generate=length_params['min_length'], - compute_attention_mask=sampling_params.get("compute_attention_mask", True) ** strategy_args, + compute_attention_mask=sampling_params.get("compute_attention_mask", True), + **strategy_args, ) compute_prob_response = get_computeprob_response(tokenizer, response, inputs) return compute_prob_response From e418f71ed562362e8ba2a9aa8249f65390b6e019 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 20 Jun 2023 19:52:18 +0300 Subject: [PATCH 047/123] Fix k2 build topo helper (#6887) Fix k2 build topo helper: reassign modified labels attribute Signed-off-by: Vladimir Bataev --- nemo/collections/asr/parts/k2/topologies.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo/collections/asr/parts/k2/topologies.py b/nemo/collections/asr/parts/k2/topologies.py index c892b2643332..a3b6fcf0fef7 100644 --- a/nemo/collections/asr/parts/k2/topologies.py +++ b/nemo/collections/asr/parts/k2/topologies.py @@ -46,9 +46,11 @@ def build_topo(name: str, tokens: List[int], blank_num: int, with_self_loops: bo else: raise ValueError(f"Unknown topo name: {name}") if blank_num != 0: - blank_mask = ans.labels == 0 - ans.labels[(ans.labels != -1) & (ans.labels <= blank_num)] -= 1 - ans.labels[blank_mask] = blank_num + labels = ans.labels + blank_mask = labels == 0 + labels[(labels != -1) & (labels <= blank_num)] -= 1 + labels[blank_mask] = blank_num + ans.labels = labels # force update ans.labels property to notify FSA about modifications, required by k2 ans = k2.arc_sort(ans) return ans From 63d9b2c906d080ef412c7fcd20ccbcaa88404154 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 20 Jun 2023 16:24:23 -0600 Subject: [PATCH 048/123] Update container for import action (#6883) * update container Signed-off-by: ericharper * run import 
tests in parallel, isntall Cython Signed-off-by: ericharper * fix typo Signed-off-by: ericharper * remove redundant comment Signed-off-by: ericharper * fix more typos Signed-off-by: ericharper * upload and download wheel Signed-off-by: ericharper * fix typos Signed-off-by: ericharper * fix typos Signed-off-by: ericharper * test order Signed-off-by: ericharper * remove name Signed-off-by: ericharper * fix indent Signed-off-by: ericharper * add names back Signed-off-by: ericharper * don't upload or download just build in parallel Signed-off-by: ericharper --------- Signed-off-by: ericharper --- .github/workflows/import-test.yml | 62 ++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml index 5fc34347710d..e9b10e1e34af 100644 --- a/.github/workflows/import-test.yml +++ b/.github/workflows/import-test.yml @@ -6,25 +6,24 @@ on: paths: - "**" +# Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags jobs: - ci-import-check: - runs-on: ubuntu-latest - # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags + test-asr-imports: + runs-on: ubuntu-latest container: - image: pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime - + image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime steps: - - uses: actions/checkout@v2 - + - name: Checkout repo + uses: actions/checkout@v2 - name: Update base dependencies run: | apt-get update && apt-get install -y build-essential apt-get install -y libsndfile1 make - - name: Install nemo dependencies id: nemo-wheel run: | + pip install Cython # install test requirements pip install -r requirements/requirements_test.txt # Build nemo as a wheel @@ -33,7 +32,6 @@ jobs: # Preserve wheel location DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) echo "::set-output name=DIST_FILE::${DIST_FILE}" - - name: Test ASR Domain Imports run: | # Install NeMo Domain @@ -43,6 +41,29 @@ jobs: # Uninstall NeMo pip uninstall -y 
nemo_toolkit + test-tts-imports: + runs-on: ubuntu-latest + container: + image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + steps: + - name: Checkout repo + uses: actions/checkout@v2 + - name: Update base dependencies + run: | + apt-get update && apt-get install -y build-essential + apt-get install -y libsndfile1 make + - name: Install nemo dependencies + id: nemo-wheel + run: | + pip install Cython + # install test requirements + pip install -r requirements/requirements_test.txt + # Build nemo as a wheel + pip install build + python -m build --no-isolation --wheel + # Preserve wheel location + DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) + echo "::set-output name=DIST_FILE::${DIST_FILE}" - name: Test TTS Domain Imports run: | # Install NeMo Domain @@ -52,6 +73,29 @@ jobs: # Uninstall NeMo pip uninstall -y nemo_toolkit + test-nlp-imports: + runs-on: ubuntu-latest + container: + image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + steps: + - name: Checkout repo + uses: actions/checkout@v2 + - name: Update base dependencies + run: | + apt-get update && apt-get install -y build-essential + apt-get install -y libsndfile1 make + - name: Install nemo dependencies + id: nemo-wheel + run: | + pip install Cython + # install test requirements + pip install -r requirements/requirements_test.txt + # Build nemo as a wheel + pip install build + python -m build --no-isolation --wheel + # Preserve wheel location + DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) + echo "::set-output name=DIST_FILE::${DIST_FILE}" - name: Test NLP Domain Imports run: | # Install NeMo Domain From 24837af3cd0b7cd5a79df1bb3dc4cc87f0c5a438 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 21 Jun 2023 18:30:54 +0300 Subject: [PATCH 049/123] removed unnecessary print (#6884) Signed-off-by: Dmytro Pykhtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 1ce153bcf0fb..c4bfdbbad143 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -518,7 +518,6 @@ def training_step(self, dataloader_iter, batch_idx): if self.rampup_batch_size: num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR current_global_batch_size = num_microbatch_calculator.current_global_batch_size - logging.info(current_global_batch_size) # do validation and save the checkpoint when gbs is changed if self.prev_global_batch_size != current_global_batch_size and self.prev_global_batch_size: self.trainer.should_stop = True From 328bbbbe378507c0756c5399168770d3adb9309c Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Wed, 21 Jun 2023 20:12:11 +0200 Subject: [PATCH 050/123] Fix destructor for delayed mmap dataset case (#6703) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż Co-authored-by: Eric Harper --- .../nlp/data/language_modeling/megatron/indexed_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py index 0fffb5b64a23..fe71e7f78019 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/indexed_dataset.py @@ -513,6 +513,8 @@ def _do_init(self, path, skip_warmup=True, delay_data_mmap=False): self._create_data_mmap(skip_warmup) else: logging.info(" skip creating data numpy buffer of mmap...") + self._bin_buffer_mmap = None + self._bin_buffer = None def _create_data_mmap(self, skip_warmup): if not skip_warmup: @@ -524,7 +526,8 @@ def 
_create_data_mmap(self, skip_warmup): self._bin_buffer = memoryview(self._bin_buffer_mmap) def __del__(self): - self._bin_buffer_mmap._mmap.close() + if self._bin_buffer_mmap is not None: + self._bin_buffer_mmap._mmap.close() del self._bin_buffer_mmap del self._index From 07ea9715db22f97b1b4758e88f00dd3f87ad4296 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 21 Jun 2023 13:55:47 -0700 Subject: [PATCH 051/123] removed some tests (#6900) * removed some tests Signed-off-by: arendu * updated Signed-off-by: arendu --------- Signed-off-by: arendu --- Jenkinsfile | 718 ++++++++++++++++++++++++++-------------------------- 1 file changed, 363 insertions(+), 355 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index d335378173f0..8a151d34c336 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -624,96 +624,97 @@ pipeline { } } - stage('L2: Megatron T5 Adapter PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('T5 Adapter tuning & inference TP=1 PP=2') { - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py \ - --config-name=megatron_t5_adapter_tuning_config \ - name='test_tp1_pp2' \ - exp_manager.exp_dir='examples/adapter_tuning' \ - trainer.devices=2 \ - trainer.max_steps=1 \ - trainer.val_check_interval=1 \ - trainer.max_epochs=null \ - model.data.num_workers=1 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp1_pp2.nemo' \ - model.existing_tasks=[] \ - model.new_tasks=['rte'] \ - model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.global_batch_size=4" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py \ - --config-name=megatron_t5_adapter_inference \ - 
adapter_model_file='examples/adapter_tuning/test_tp1_pp2.nemo' \ - language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp1_pp2.nemo' \ - trainer.devices=2 \ - data.num_workers=1 \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - data.global_batch_size=2 \ - data.micro_batch_size=2 \ - data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - pred_file_path='examples/adapter_tuning/test_tp1_pp2/preds.txt'" - sh "rm -rf examples/adapter_tuning/test_tp1_pp2.nemo" - sh "rm -rf examples/adapter_tuning/test_tp1_pp2" - } - } - } - } - stage('L2: Megatron T5 Adapter TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('T5 Adapter tuning & inference TP=2 PP=1') { - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py \ - --config-name=megatron_t5_adapter_tuning_config \ - name='test_tp2_pp1' \ - exp_manager.exp_dir='examples/adapter_tuning' \ - trainer.devices=2 \ - trainer.max_steps=1 \ - trainer.val_check_interval=1 \ - trainer.max_epochs=null \ - model.data.num_workers=1 \ - model.tensor_model_parallel_size=2 \ - model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ - model.existing_tasks=[] \ - model.new_tasks=['rte'] \ - model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.global_batch_size=4" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py \ - --config-name=megatron_t5_adapter_inference \ - adapter_model_file='examples/adapter_tuning/test_tp2_pp1.nemo' \ - language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ - trainer.devices=2 \ - tensor_model_parallel_size=2 \ - data.global_batch_size=2 \ - data.micro_batch_size=2 \ - data.num_workers=1 \ - 
data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - pred_file_path='examples/adapter_tuning/test_tp2_pp1/preds.txt'" - sh "rm -rf examples/adapter_tuning/test_tp2_pp1.nemo" - sh "rm -rf examples/adapter_tuning/test_tp2_pp1" - } - } - } - } + // commented out temporarily to save time on github ci + //stage('L2: Megatron T5 Adapter PP=2') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // parallel{ + // stage('T5 Adapter tuning & inference TP=1 PP=2') { + // steps { + // sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py \ + // --config-name=megatron_t5_adapter_tuning_config \ + // name='test_tp1_pp2' \ + // exp_manager.exp_dir='examples/adapter_tuning' \ + // trainer.devices=2 \ + // trainer.max_steps=1 \ + // trainer.val_check_interval=1 \ + // trainer.max_epochs=null \ + // model.data.num_workers=1 \ + // model.tensor_model_parallel_size=1 \ + // model.pipeline_model_parallel_size=2 \ + // model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp1_pp2.nemo' \ + // model.existing_tasks=[] \ + // model.new_tasks=['rte'] \ + // model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.global_batch_size=4" + // sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py \ + // --config-name=megatron_t5_adapter_inference \ + // adapter_model_file='examples/adapter_tuning/test_tp1_pp2.nemo' \ + // language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp1_pp2.nemo' \ + // trainer.devices=2 \ + // data.num_workers=1 \ + // tensor_model_parallel_size=1 \ + // pipeline_model_parallel_size=2 \ + // data.global_batch_size=2 \ + // data.micro_batch_size=2 \ + // data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // 
pred_file_path='examples/adapter_tuning/test_tp1_pp2/preds.txt'" + // sh "rm -rf examples/adapter_tuning/test_tp1_pp2.nemo" + // sh "rm -rf examples/adapter_tuning/test_tp1_pp2" + // } + // } + // } + //} + //stage('L2: Megatron T5 Adapter TP=2') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // parallel{ + // stage('T5 Adapter tuning & inference TP=2 PP=1') { + // steps { + // sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py \ + // --config-name=megatron_t5_adapter_tuning_config \ + // name='test_tp2_pp1' \ + // exp_manager.exp_dir='examples/adapter_tuning' \ + // trainer.devices=2 \ + // trainer.max_steps=1 \ + // trainer.val_check_interval=1 \ + // trainer.max_epochs=null \ + // model.data.num_workers=1 \ + // model.tensor_model_parallel_size=2 \ + // model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ + // model.existing_tasks=[] \ + // model.new_tasks=['rte'] \ + // model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.global_batch_size=4" + // sh "python examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py \ + // --config-name=megatron_t5_adapter_inference \ + // adapter_model_file='examples/adapter_tuning/test_tp2_pp1.nemo' \ + // language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ + // trainer.devices=2 \ + // tensor_model_parallel_size=2 \ + // data.global_batch_size=2 \ + // data.micro_batch_size=2 \ + // data.num_workers=1 \ + // data.test_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // pred_file_path='examples/adapter_tuning/test_tp2_pp1/preds.txt'" + // sh "rm -rf examples/adapter_tuning/test_tp2_pp1.nemo" + // sh "rm -rf examples/adapter_tuning/test_tp2_pp1" + // } + // } + // } + //} stage('L2: Megatron T5 IA3 PP=2') { when { anyOf { 
@@ -847,50 +848,51 @@ pipeline { } } } - stage('L2: Megatron GPT Adapter PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('GPT Adapter tuning & inference TP=1 PP=2') { - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py \ - --config-name=megatron_gpt_adapter_tuning_config \ - name='test_tp1_pp2' \ - exp_manager.exp_dir='examples/adapter_tuning' \ - trainer.devices=2 \ - trainer.max_steps=1 \ - trainer.val_check_interval=1 \ - trainer.max_epochs=null \ - model.data.num_workers=1 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ - model.existing_tasks=[] \ - model.new_tasks=['rte'] \ - model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.global_batch_size=4" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_adapter_eval.py \ - --config-name=megatron_gpt_adapter_inference \ - adapter_model_file='examples/adapter_tuning/test_tp1_pp2.nemo' \ - gpt_model_file='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ - inference.greedy=True \ - inference.add_BOS=False \ - trainer.devices=2 \ - num_workers=1 \ - tensor_model_parallel_size=2 \ - data_paths=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl']" - sh "rm -rf examples/adapter_tuning/test_tp1_pp2.nemo" - sh "rm -rf examples/adapter_tuning/test_tp1_pp2" - } - } - } - } + // commented out to save time on github ci @adithyare + //stage('L2: Megatron GPT Adapter PP=2') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // parallel{ + // stage('GPT Adapter tuning & inference TP=1 PP=2') { + // steps { + // sh "python 
examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py \ + // --config-name=megatron_gpt_adapter_tuning_config \ + // name='test_tp1_pp2' \ + // exp_manager.exp_dir='examples/adapter_tuning' \ + // trainer.devices=2 \ + // trainer.max_steps=1 \ + // trainer.val_check_interval=1 \ + // trainer.max_epochs=null \ + // model.data.num_workers=1 \ + // model.tensor_model_parallel_size=1 \ + // model.pipeline_model_parallel_size=2 \ + // model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ + // model.existing_tasks=[] \ + // model.new_tasks=['rte'] \ + // model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.global_batch_size=4" + // sh "python examples/nlp/language_modeling/tuning/megatron_gpt_adapter_eval.py \ + // --config-name=megatron_gpt_adapter_inference \ + // adapter_model_file='examples/adapter_tuning/test_tp1_pp2.nemo' \ + // gpt_model_file='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp2.nemo' \ + // inference.greedy=True \ + // inference.add_BOS=False \ + // trainer.devices=2 \ + // num_workers=1 \ + // tensor_model_parallel_size=2 \ + // data_paths=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl']" + // sh "rm -rf examples/adapter_tuning/test_tp1_pp2.nemo" + // sh "rm -rf examples/adapter_tuning/test_tp1_pp2" + // } + // } + // } + //} stage('L2: Speech Transcription') { when { anyOf { @@ -3278,43 +3280,44 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py 
\ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=1 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + // commented out to save time on github ci @adithyare + //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + //trainer.devices=2 \ + //trainer.accelerator=gpu \ + //trainer.log_every_n_steps=1 \ + //trainer.val_check_interval=2 \ + //trainer.limit_val_batches=1 \ + //trainer.accumulate_grad_batches=1 \ + //trainer.max_steps=6 \ + //trainer.precision=16 \ + //trainer.gradient_clip_val=1.0 \ + 
//exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + //exp_manager.resume_if_exists=True \ + //model.tensor_model_parallel_size=2 \ + //model.optim.name=fused_adam \ + //model.optim.lr=2e-4 \ + //model.optim.sched.warmup_steps=2 \ + //model.optim.sched.constant_steps=2 \ + //model.optim.sched.min_lr=8e-5 \ + //model.max_position_embeddings=128 \ + //model.encoder_seq_length=128 \ + //model.data.seq_length=128 \ + //model.position_embedding_type=rope \ + //model.rotary_percentage=0.5 \ + //model.normalization=rmsnorm \ + //model.bias=False \ + //model.bias_activation_fusion=False \ + //model.bias_dropout_add_fusion=False \ + //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + //model.num_layers=8 \ + //model.hidden_size=256 \ + //model.num_attention_heads=8 \ + //model.activations_checkpoint_method='block' \ + //model.activations_checkpoint_granularity='full' \ + //model.activations_checkpoint_num_layers=1 \ + //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } @@ -3365,44 +3368,45 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ model.use_flash_attention=True" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=1 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - model.use_flash_attention=True" + // commented out to save time on github ci @adithyare + //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + //trainer.devices=2 \ + //trainer.accelerator=gpu \ + //trainer.log_every_n_steps=1 \ + //trainer.val_check_interval=2 \ + //trainer.limit_val_batches=1 \ + //trainer.accumulate_grad_batches=1 \ + //trainer.max_steps=6 \ + //trainer.precision=16 \ + //trainer.gradient_clip_val=1.0 \ + 
//exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + //exp_manager.resume_if_exists=True \ + //model.tensor_model_parallel_size=2 \ + //model.optim.name=fused_adam \ + //model.optim.lr=2e-4 \ + //model.optim.sched.warmup_steps=2 \ + //model.optim.sched.constant_steps=2 \ + //model.optim.sched.min_lr=8e-5 \ + //model.max_position_embeddings=128 \ + //model.encoder_seq_length=128 \ + //model.data.seq_length=128 \ + //model.position_embedding_type=rope \ + //model.rotary_percentage=0.5 \ + //model.normalization=rmsnorm \ + //model.bias=False \ + //model.bias_activation_fusion=False \ + //model.bias_dropout_add_fusion=False \ + //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + //model.num_layers=8 \ + //model.hidden_size=256 \ + //model.num_attention_heads=8 \ + //model.activations_checkpoint_method='block' \ + //model.activations_checkpoint_granularity='full' \ + //model.activations_checkpoint_num_layers=1 \ + //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + //model.use_flash_attention=True" sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } @@ -3451,42 +3455,43 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - 
trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=1 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + // not testing resume functionality to save time on ci @adithyare + //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + //trainer.devices=2 \ + //trainer.accelerator=gpu \ + //trainer.log_every_n_steps=1 \ + //trainer.val_check_interval=2 \ + //trainer.limit_val_batches=1 \ + //trainer.accumulate_grad_batches=1 \ + //trainer.max_steps=6 \ + //trainer.precision=16 \ + //trainer.gradient_clip_val=1.0 \ + 
//exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + //exp_manager.resume_if_exists=True \ + //model.tensor_model_parallel_size=2 \ + //model.optim.name=fused_adam \ + //model.optim.lr=2e-4 \ + //model.optim.sched.warmup_steps=2 \ + //model.optim.sched.constant_steps=2 \ + //model.optim.sched.min_lr=8e-5 \ + //model.max_position_embeddings=128 \ + //model.encoder_seq_length=128 \ + //model.data.seq_length=128 \ + //model.position_embedding_type=alibi \ + //model.normalization=rmsnorm \ + //model.bias=False \ + //model.bias_activation_fusion=False \ + //model.bias_dropout_add_fusion=False \ + //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + //model.num_layers=8 \ + //model.hidden_size=256 \ + //model.num_attention_heads=8 \ + //model.activations_checkpoint_method='block' \ + //model.activations_checkpoint_granularity='full' \ + //model.activations_checkpoint_num_layers=1 \ + //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } @@ -3535,42 +3540,43 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=1 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + // commented out to save time on github ci @adithyare + //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + //trainer.devices=2 \ + //trainer.accelerator=gpu \ + //trainer.log_every_n_steps=1 \ + //trainer.val_check_interval=2 \ + //trainer.limit_val_batches=1 \ + //trainer.accumulate_grad_batches=1 \ + //trainer.max_steps=6 \ + //trainer.precision=16 \ + //trainer.gradient_clip_val=1.0 \ + //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + 
//exp_manager.resume_if_exists=True \ + //model.tensor_model_parallel_size=2 \ + //model.optim.name=fused_adam \ + //model.optim.lr=2e-4 \ + //model.optim.sched.warmup_steps=2 \ + //model.optim.sched.constant_steps=2 \ + //model.optim.sched.min_lr=8e-5 \ + //model.max_position_embeddings=128 \ + //model.encoder_seq_length=128 \ + //model.data.seq_length=128 \ + //model.position_embedding_type=kerple \ + //model.normalization=rmsnorm \ + //model.bias=False \ + //model.bias_activation_fusion=False \ + //model.bias_dropout_add_fusion=False \ + //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + //model.num_layers=8 \ + //model.hidden_size=256 \ + //model.num_attention_heads=8 \ + //model.activations_checkpoint_method='block' \ + //model.activations_checkpoint_granularity='full' \ + //model.activations_checkpoint_num_layers=1 \ + //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } @@ -3856,40 +3862,41 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' rm -rf examples/nlp/language_modeling/out.jsonl" } } - stage('L2: Megatron GPT Prompt Tuning TP1 PP1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('GPT Prompt Learning TP=1 PP=1') { - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_prompt_learning.py \ - --config-name=megatron_gpt_prompt_learning_config \ - name='/home/TestData/nlp/prompt_learning/prompt_tuning_test' \ - trainer.devices=1 \ - trainer.max_steps=1 \ - trainer.val_check_interval=1 \ - 
trainer.max_epochs=null \ - model.data.num_workers=1 \ - model.tensor_model_parallel_size=1 \ - model.virtual_prompt_style='p-tuning' \ - model.p_tuning.encoder_type='embedding' \ - model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp1.nemo' \ - model.existing_tasks=[] \ - model.new_tasks=['rte'] \ - model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ - model.global_batch_size=4" - sh "rm -rf /home/TestData/nlp/prompt_learning/prompt_tuning_test" - sh "rm -rf /home/TestData/nlp/prompt_learning/prompt_tuning_test.nemo" - } - } - } - } + // commented out to save time we are testing tp>1 and pp>1 anyway. @adithyare + //stage('L2: Megatron GPT Prompt Tuning TP1 PP1') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // parallel{ + // stage('GPT Prompt Learning TP=1 PP=1') { + // steps { + // sh "python examples/nlp/language_modeling/megatron_gpt_prompt_learning.py \ + // --config-name=megatron_gpt_prompt_learning_config \ + // name='/home/TestData/nlp/prompt_learning/prompt_tuning_test' \ + // trainer.devices=1 \ + // trainer.max_steps=1 \ + // trainer.val_check_interval=1 \ + // trainer.max_epochs=null \ + // model.data.num_workers=1 \ + // model.tensor_model_parallel_size=1 \ + // model.virtual_prompt_style='p-tuning' \ + // model.p_tuning.encoder_type='embedding' \ + // model.language_model_path='/home/TestData/nlp/megatron_gpt/tiny/megatron_14m_gpt_tp1_pp1.nemo' \ + // model.existing_tasks=[] \ + // model.new_tasks=['rte'] \ + // model.data.train_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl'] \ + // model.global_batch_size=4" + // sh "rm -rf /home/TestData/nlp/prompt_learning/prompt_tuning_test" + // sh "rm -rf 
/home/TestData/nlp/prompt_learning/prompt_tuning_test.nemo" + // } + // } + // } + //} stage('L2: Megatron GPT Prompt Tuning TP2 PP1') { when { @@ -4456,46 +4463,47 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } - stage('L2: Megatron T5 Prompt Learning TP1 PP1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('T5 Prompt Learning TP=1 PP=1') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning.py \ - --config-name=megatron_t5_prompt_learning \ - name='/home/TestData/nlp/prompt_learning/t5_p_tuning_test' \ - trainer.devices=1 \ - trainer.max_steps=1 \ - trainer.val_check_interval=1 \ - trainer.max_epochs=null \ - model.data.num_workers=1 \ - model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ - model.existing_tasks=[] \ - model.new_tasks=['squad'] \ - model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ - model.data.validation_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ - model.global_batch_size=4 \ - model.micro_batch_size=4" - sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test" - sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \ - virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo' \ - language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ - data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ - pred_file_path='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt' \ - data.global_batch_size=4 \ - data.micro_batch_size=4" - sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo" - sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt" - } - } - } - } + // commented out to save time in github ci, we have tp>1 and pp>1 tests anyway @adithyare + //stage('L2: Megatron T5 Prompt Learning TP1 
PP1') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // parallel{ + // stage('T5 Prompt Learning TP=1 PP=1') { + // steps { + // sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning.py \ + // --config-name=megatron_t5_prompt_learning \ + // name='/home/TestData/nlp/prompt_learning/t5_p_tuning_test' \ + // trainer.devices=1 \ + // trainer.max_steps=1 \ + // trainer.val_check_interval=1 \ + // trainer.max_epochs=null \ + // model.data.num_workers=1 \ + // model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ + // model.existing_tasks=[] \ + // model.new_tasks=['squad'] \ + // model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ + // model.data.validation_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ + // model.global_batch_size=4 \ + // model.micro_batch_size=4" + // sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test" + // sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \ + // virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo' \ + // language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ + // data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ + // pred_file_path='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt' \ + // data.global_batch_size=4 \ + // data.micro_batch_size=4" + // sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo" + // sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt" + // } + // } + // } + //} stage('L2: Megatron T5 Prompt Learning TP2 PP1') { when { From 0b94ef808cd934128a696c9a75aa7f3a37727130 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Wed, 21 Jun 2023 17:41:31 -0400 Subject: [PATCH 052/123] Fix transcribe_utils.py for hybrid models in 
partial transcribe mode (#6899) * Fix transcribe_utils.py for hybrid models in partial transcribe mode Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_utils.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- nemo/collections/asr/parts/utils/transcribe_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 7cf957a7cec0..11e7792cfb21 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -23,8 +23,7 @@ from tqdm.auto import tqdm import nemo.collections.asr as nemo_asr -from nemo.collections.asr.models import ASRModel -from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -421,6 +420,8 @@ def transcribe_partial_audio( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) logits, logits_len = outputs[0], outputs[1] + if isinstance(asr_model, EncDecHybridRNNTCTCModel) and decoder_type == "ctc": + logits = asr_model.ctc_decoder(encoder_output=logits) if logprobs: # dump log probs per file for idx in range(logits.shape[0]): From 29015df1ee58d141cc00c0c6afe431002cff2201 Mon Sep 17 00:00:00 2001 From: George <37293288+Jorjeous@users.noreply.github.com> Date: Thu, 22 Jun 2023 18:16:48 +0400 Subject: [PATCH 053/123] hot fix SDE (#6897) * hot fix SDE Signed-off-by: George * [pre-commit.ci] auto 
fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: George Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Vitaly Lavrukhin --- tools/speech_data_explorer/data_explorer.py | 301 ++++++++++---------- 1 file changed, 151 insertions(+), 150 deletions(-) diff --git a/tools/speech_data_explorer/data_explorer.py b/tools/speech_data_explorer/data_explorer.py index de2b342a1028..65eafc5c9d49 100755 --- a/tools/speech_data_explorer/data_explorer.py +++ b/tools/speech_data_explorer/data_explorer.py @@ -1138,140 +1138,161 @@ def display_query(query): ] ) - -comparison_layout = [ - html.Div( - [ - dcc.Markdown("model 1:" + ' ' + model_name_1[10:]), - dcc.Markdown("model 2:" + ' ' + model_name_2[10:]), - dcc.Dropdown( - ['word level', 'utterance level'], 'word level', placeholder="choose comparison lvl", id='lvl_choose' - ), - ] - ), - html.Hr(), - html.Div( - [ - html.Div( - [ - dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_1, id='xaxis-column'), - dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_2, id='yaxis-column'), - dcc.Dropdown( - for_col_names.select_dtypes(include='number').columns[::], - placeholder='Select what will encode color of points', - id='color-column', - ), - dcc.Dropdown( - for_col_names.select_dtypes(include='number').columns[::], - placeholder='Select what will encode size of points', - id='size-column', - ), - dcc.Dropdown( - ['yes', 'no'], - placeholder='if you want to enable dot spacing', - id='dot_spacing', - style={'width': '200%'}, - ), - dcc.Input(id='radius', placeholder='Enter radius of spacing (std is 0.01)'), - html.Hr(), - dcc.Input(id='filter-query-input', placeholder='Enter filter query',), - ], - style={'width': '200%', 'display': 'inline-block', 'float': 'middle'}, - ), - html.Hr(), - html.Div(id='filter-query-output'), - dash_table.DataTable( - id='datatable-advanced-filtering', - 
columns=wordstable_columns_tool, - data=vocabulary_1, - editable=False, - page_action='native', - page_size=5, - filter_action="native", - ), - html.Hr(), - html.Div(id='datatable-query-structure', style={'whitespace': 'pre'}), - html.Hr(), - dbc.Row(dbc.Col(dcc.Graph(id='voc_graph'),),), - html.Hr(), - ], - id='wrd_lvl', - style={'display': 'block'}, - ), - html.Div( - [ - html.Div( - [ - dcc.Dropdown(['WER', 'CER'], 'WER', placeholder="Choose metric", id="choose_metric"), - dbc.Row(dbc.Col(html.H5('Data'), class_name='text-secondary'), class_name='mt-3'), - html.Hr(), - html.Hr(), - dcc.Input(id='filter-query-input-2', placeholder='Enter filter query', style={'width': '100%'}), - html.Div(id='filter-query-output-2'), - dbc.Row( - dbc.Col( - [ - dash_table.DataTable( - id='datatable-advanced-filtering-2', - columns=[ - {'name': k.replace('_', ' '), 'id': k, 'hideable': True} - for k in data_with_metrics[0] - ], - data=data_with_metrics, - editable=False, - page_action='native', - page_size=5, - row_selectable='single', - selected_rows=[0], - page_current=0, - filter_action="native", - style_cell={ - 'overflow': 'hidden', - 'textOverflow': 'ellipsis', - 'maxWidth': 0, - 'textAlign': 'center', - }, - style_header={ - 'color': 'text-primary', - 'text_align': 'center', - 'height': 'auto', - 'whiteSpace': 'normal', - }, - css=[ - { - 'selector': '.dash-spreadsheet-menu', - 'rule': 'position:absolute; bottom: 8px', + comparison_layout = [ + html.Div( + [ + dcc.Markdown("model 1:" + ' ' + model_name_1[10:]), + dcc.Markdown("model 2:" + ' ' + model_name_2[10:]), + dcc.Dropdown( + ['word level', 'utterance level'], + 'word level', + placeholder="choose comparison lvl", + id='lvl_choose', + ), + ] + ), + html.Hr(), + html.Div( + [ + html.Div( + [ + dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_1, id='xaxis-column'), + dcc.Dropdown(for_col_names.columns[::], 'accuracy_model_' + model_name_2, id='yaxis-column'), + dcc.Dropdown( + 
for_col_names.select_dtypes(include='number').columns[::], + placeholder='Select what will encode color of points', + id='color-column', + ), + dcc.Dropdown( + for_col_names.select_dtypes(include='number').columns[::], + placeholder='Select what will encode size of points', + id='size-column', + ), + dcc.Dropdown( + ['yes', 'no'], + placeholder='if you want to enable dot spacing', + id='dot_spacing', + style={'width': '200%'}, + ), + dcc.Input(id='radius', placeholder='Enter radius of spacing (std is 0.01)'), + html.Hr(), + dcc.Input(id='filter-query-input', placeholder='Enter filter query',), + ], + style={'width': '200%', 'display': 'inline-block', 'float': 'middle'}, + ), + html.Hr(), + html.Div(id='filter-query-output'), + dash_table.DataTable( + id='datatable-advanced-filtering', + columns=wordstable_columns_tool, + data=vocabulary_1, + editable=False, + page_action='native', + page_size=5, + filter_action="native", + ), + html.Hr(), + html.Div(id='datatable-query-structure', style={'whitespace': 'pre'}), + html.Hr(), + dbc.Row(dbc.Col(dcc.Graph(id='voc_graph'),),), + html.Hr(), + ], + id='wrd_lvl', + style={'display': 'block'}, + ), + html.Div( + [ + html.Div( + [ + dcc.Dropdown(['WER', 'CER'], 'WER', placeholder="Choose metric", id="choose_metric"), + dbc.Row(dbc.Col(html.H5('Data'), class_name='text-secondary'), class_name='mt-3'), + html.Hr(), + html.Hr(), + dcc.Input( + id='filter-query-input-2', placeholder='Enter filter query', style={'width': '100%'} + ), + html.Div(id='filter-query-output-2'), + dbc.Row( + dbc.Col( + [ + dash_table.DataTable( + id='datatable-advanced-filtering-2', + columns=[ + {'name': k.replace('_', ' '), 'id': k, 'hideable': True} + for k in data_with_metrics[0] + ], + data=data_with_metrics, + editable=False, + page_action='native', + page_size=5, + row_selectable='single', + selected_rows=[0], + page_current=0, + filter_action="native", + style_cell={ + 'overflow': 'hidden', + 'textOverflow': 'ellipsis', + 'maxWidth': 0, + 
'textAlign': 'center', }, - {'selector': '.dash-filter--case', 'rule': 'display: none'}, - {'selector': '.column-header--hide', 'rule': 'display: none'}, - ], + style_header={ + 'color': 'text-primary', + 'text_align': 'center', + 'height': 'auto', + 'whiteSpace': 'normal', + }, + css=[ + { + 'selector': '.dash-spreadsheet-menu', + 'rule': 'position:absolute; bottom: 8px', + }, + {'selector': '.dash-filter--case', 'rule': 'display: none'}, + {'selector': '.column-header--hide', 'rule': 'display: none'}, + ], + ), + dbc.Row(dbc.Col(html.Audio(id='player-1', controls=True),), class_name='mt-3'), + ] + ) + ), + ] + + [ + dbc.Row( + [ + dbc.Col( + html.Div(children=k.replace('_', '-')), + width=2, + class_name='mt-1 bg-light font-monospace text-break small rounded border', + ), + dbc.Col( + html.Div(id='__' + k), + class_name='mt-1 bg-light font-monospace text-break small rounded border', ), - dbc.Row(dbc.Col(html.Audio(id='player-1', controls=True),), class_name='mt-3'), ] ) - ), - ] - + [ - dbc.Row( - [ - dbc.Col( - html.Div(children=k.replace('_', '-')), - width=2, - class_name='mt-1 bg-light font-monospace text-break small rounded border', - ), - dbc.Col( - html.Div(id='__' + k), - class_name='mt-1 bg-light font-monospace text-break small rounded border', - ), - ] - ) - for k in data_with_metrics[0] - ] - ), - ], - id='unt_lvl', - ), -] + for k in data_with_metrics[0] + ] + ), + ], + id='unt_lvl', + ), + ] + [ + html.Div( + [ + html.Div( + [ + dbc.Row(dbc.Col(dcc.Graph(id='utt_graph'),),), + html.Hr(), + dcc.Input(id='clicked_aidopath', style={'width': '100%'}), + html.Hr(), + dcc.Input(id='my-output-1', style={'display': 'none'}), # we do need this + ] + ), + html.Div([dbc.Row(dbc.Col(dcc.Graph(id='signal-graph-1')), class_name='mt-3'),]), + ], + id='down_thing', + style={'display': 'block'}, + ) + ] if args.show_statistics is not None: comparison_layout += [ @@ -1329,26 +1350,6 @@ def show_hide_element(visibility_state): ) -comparison_layout += [ - html.Div( - [ 
- html.Div( - [ - dbc.Row(dbc.Col(dcc.Graph(id='utt_graph'),),), - html.Hr(), - dcc.Input(id='clicked_aidopath', style={'width': '100%'}), - html.Hr(), - dcc.Input(id='my-output-1', style={'display': 'none'}), # we do need this - ] - ), - html.Div([dbc.Row(dbc.Col(dcc.Graph(id='signal-graph-1')), class_name='mt-3'),]), - ], - id='down_thing', - style={'display': 'block'}, - ) -] - - if args.show_statistics is None: @app.callback( From a8609ab6a83377f30d42f6b225412f28b4b8f05b Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 22 Jun 2023 09:22:27 -0700 Subject: [PATCH 054/123] fix ptuning residuals bug (#6866) * fix for lora bug and makes ptuning w peft framework compatible with FT inference Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update Signed-off-by: arendu * simple forward call for adapters with residual Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron_gpt_peft_models.py | 16 ++++- .../megatron/adapters/parallel_adapters.py | 58 +++++++++++++------ .../modules/common/megatron/language_model.py | 5 +- .../modules/common/megatron/transformer.py | 21 +++---- .../nlp/modules/common/prompt_encoder.py | 5 +- 5 files changed, 66 insertions(+), 39 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index 930bfbc8cf25..f1f44e31e175 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -225,6 +225,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.name_key_to_cfg = 
{AdapterName.PTUNING_ADAPTER: adapter_cfg} super().__init__(cfg, trainer) self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens + self.trainable_keys = self.adapter_keys - set( + [ + "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight" + ] + ) + # we exclude the above parameter from training because it is present for backward compatibility for inference using FasterTransformer (@adithyare) def init_peft_modules(self,): """ @@ -268,7 +274,15 @@ def load_state_dict(self, state_dict, strict: bool = True): def setup_optimizer_param_groups(self): if self.first_stage_of_pipeline(): - super().setup_optimizer_param_groups() + # super().setup_optimizer_param_groups() + self.freeze() # Freeze the entire model + opt_params = [] + for n, p in self.named_parameters(): + if n in self.trainable_keys: + p.requires_grad = True + opt_params.append(p) + + self._optimizer_param_groups = ({"params": opt_params},) else: self.freeze() # Freeze the entire model self._optimizer_param_groups = ({"params": []},) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 679020019ab1..fe339c6f9a8b 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -27,6 +27,7 @@ from nemo.collections.common.parts.utils import activation_registry from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu from nemo.collections.nlp.modules.common.megatron.utils import init_method_const, init_method_normal +from nemo.collections.nlp.modules.common.prompt_encoder import InferenceTable from nemo.core.classes.mixins import adapter_mixin_strategies try: @@ -65,13 +66,11 @@ class AdapterName(str, enum.Enum): class InfusedAdapter(nn.Module, AdapterModuleUtil): - def __init__( - self, in_features: 
int, adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None, - ) -> None: + def __init__(self, in_features: int,) -> None: super().__init__() self.scalers = nn.Parameter(torch.ones(in_features)) # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) + self.setup_adapter_strategy(adapter_mixin_strategies.ReturnResultAdapterStrategy()) def forward(self, x): x = x * self.scalers[None, None, :] @@ -90,7 +89,6 @@ class MLPInfusedAdapter(InfusedAdapter): @dataclass class InfusedAdapterConfig: in_features: int - adapter_strategy: Optional[Any] = adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() _target_: str = "{0}.{1}".format(InfusedAdapter.__module__, InfusedAdapter.__name__) @@ -112,7 +110,6 @@ def __init__( row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. gather_output: bool = True, dropout: float = 0.0, - adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None, ): super().__init__() if not HAVE_APEX: @@ -153,7 +150,7 @@ def __init__( self.dropout = None # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) + self.setup_adapter_strategy(adapter_mixin_strategies.ReturnResultAdapterStrategy()) def _get_init_fn(self, init_method: str): if init_method == 'xavier': @@ -196,7 +193,6 @@ class ParallelLinearAdapterConfig: row_init_method: str = 'zero' gather_output: bool = True dropout: float = 0.0 - adapter_strategy: Optional[Any] = adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) @@ -250,13 +246,7 @@ class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ def __init__( - self, - virtual_tokens: int, - bottleneck_dim: int, - embedding_dim: int, - init_std: float, - output_dim: int, - adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None, + self, virtual_tokens: int, 
bottleneck_dim: int, embedding_dim: int, init_std: float, output_dim: int, ): """ Initializes the Tensor Model parallel MLP PromptEncoderMLP module. @@ -278,6 +268,7 @@ def __init__( # (@adithyare) the persistent=False will not pollute the indices into the state_dict of this module. self.register_buffer("indices", torch.LongTensor(list(range(self.virtual_tokens))), persistent=False) self.embedding = torch.nn.Embedding(self.virtual_tokens, self.embedding_dim) + self.inference_table = InferenceTable("taskname", self.embedding_dim, self.virtual_tokens) self.first = ColumnParallelLinear( self.embedding_dim, self.bottleneck_dim, @@ -301,15 +292,47 @@ def __init__( gradient_accumulation_fusion=gradient_accumulation_fusion, ) # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) + self.setup_adapter_strategy(adapter_mixin_strategies.ReturnResultAdapterStrategy()) + + def set_inference_table(self, prompt_representation: torch.Tensor): + """ + This method caches the output representation from the Encoder and saves it inside `self.inference_table`. 
+ """ + prompt_representation = prompt_representation.detach().clone() + self.inference_table.set_prompt_table(prompt_representation) + + def clear_inference_table(self,): + self.inference_table.clear_prompt_table() + + def get_inference_table(self,): + return self.inference_table.get_prompt_table() - def forward(self, batch_size): + def inner_forward(self,): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) output_embeds, bias_parallel = self.second(intermediate_parallel) output_embeds = output_embeds + bias_parallel output_embeds = output_embeds.transpose(0, 1) + return output_embeds + + def forward(self, batch_size: int, use_cached_reps: bool = False) -> torch.Tensor: + """ + Forward pass through the encoder with caching of prompt representations + """ + if use_cached_reps: + output_embeds = self.get_inference_table().unsqueeze(1) + else: + if self.training: + if self.inference_table.is_inference_ready: + self.clear_inference_table() + output_embeds = self.inner_forward() + else: + if not self.inference_table.is_inference_ready: + output_embeds = self.inner_forward() + self.set_inference_table(output_embeds.squeeze(1)) + output_embeds = self.get_inference_table().unsqueeze(1) + output_embeds = output_embeds.expand(self.virtual_tokens, batch_size, self.output_dim) return output_embeds @@ -321,5 +344,4 @@ class PromptEncoderAdapterConfig: embedding_dim: int init_std: float output_dim: int - adapter_strategy: Optional[Any] = adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() _target_: str = "{0}.{1}".format(PromptEncoderAdapter.__module__, PromptEncoderAdapter.__name__) diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index 2d10576dc7d0..a3fa3fd6d2be 100755 --- 
a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -746,10 +746,7 @@ def forward( ptuning_adapter = self.get_adapter_module(AdapterName.PTUNING_ADAPTER) v = ptuning_adapter.virtual_tokens if ptuning_adapter and _sq >= v: # The sequence should be longer the v to insert virtual embeddings. - strategy = ptuning_adapter.adapter_strategy - virtual_embeddings = self.forward_single_enabled_adapter_( - _bs, ptuning_adapter, adapter_name=AdapterName.PTUNING_ADAPTER, adapter_strategy=strategy, - ) + virtual_embeddings = ptuning_adapter(_bs) encoder_input = encoder_input[ v:, :, : ] # the first v tokens are pads so that they can be swapped out with virtual embeddings. diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 8a0b22b4d289..ea01acd14a23 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -549,13 +549,9 @@ def forward( if self.is_adapter_available(): adapter_1 = self.get_adapter_module(AdapterName.PRE_ATTN_ADAPTER) if adapter_1: - strategy = adapter_1.adapter_strategy - attention_output = self.forward_single_enabled_adapter_( - attention_output, - adapter_1, - adapter_name=AdapterName.PRE_ATTN_ADAPTER, - adapter_strategy=strategy, - ) + attention_output = ( + adapter_1(attention_output) + attention_output + ) # simple adapter call with residual connection layernorm_input = bias_dropout_add_func(attention_output, attention_bias, residual, self.hidden_dropout) # print(f"Layer: {self.layer_number} Attention checksum {layernorm_input.sum()}") @@ -626,15 +622,12 @@ def forward( layernorm_input = normalization_output # MLP. 
mlp_output, mlp_bias = self.mlp(normalization_output) - if ( - self.is_adapter_available() - ): # TODO: (@adithyre) was able to move adapter_2 back to the end of the transformer after ptl 1.7 update. + if self.is_adapter_available(): + # TODO: (@adithyre) was able to move adapter_2 back to the end of the transformer after ptl 1.7 update. adapter_2 = self.get_adapter_module(AdapterName.POST_ATTN_ADAPTER) if adapter_2: - strategy = adapter_2.adapter_strategy - mlp_output = self.forward_single_enabled_adapter_( - mlp_output, adapter_2, adapter_name=AdapterName.POST_ATTN_ADAPTER, adapter_strategy=strategy - ) + mlp_output = adapter_2(mlp_output) + mlp_output # simple adapter call with residual connection + residual = layernorm_input bias_dropout_add_func = self._get_bias_droput_add_func( diff --git a/nemo/collections/nlp/modules/common/prompt_encoder.py b/nemo/collections/nlp/modules/common/prompt_encoder.py index 282ad053bc86..283608367b62 100644 --- a/nemo/collections/nlp/modules/common/prompt_encoder.py +++ b/nemo/collections/nlp/modules/common/prompt_encoder.py @@ -70,7 +70,7 @@ def __init__( self.prompt_embeddings.weight.requires_grad = False # Set fixed indicies for forward pass - self.register_buffer('indices', torch.LongTensor(list(range(self.total_virtual_tokens)))) + self.register_buffer("indices", torch.LongTensor(list(range(self.total_virtual_tokens))), persistent=False) def clear_prompt_embedding_weights(self,): """ @@ -104,9 +104,10 @@ def __init__(self, taskname, hidden_size, total_virtual_tokens, is_inference_rea self.total_virtual_tokens = total_virtual_tokens self.prompt_table = torch.nn.ModuleDict() self.prompt_table[self.taskname] = PromptEmbedding(self.hidden_size, self.total_virtual_tokens) - self.prompt_table[self.taskname].prompt_embeddings.weight.requires_grad = False self.prompt_table[self.taskname].clear_prompt_embedding_weights() self.is_inference_ready = is_inference_ready + for p in self.prompt_table.parameters(): + p.requires_grad = False 
def set_prompt_table(self, prompt_representation: torch.Tensor): """ From 698a5f77297cdbd9bb8f2a926f13b7c4186c8863 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:37:08 -0400 Subject: [PATCH 055/123] Add hybrid model support to transcribe_speech_parallel.py (#6906) * Add hybrid model support to transcribe_speech_parallel.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update audio_to_text_dataset.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- examples/asr/transcribe_speech_parallel.py | 18 +++++++++++++++++- .../asr/data/audio_to_text_dataset.py | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py index 74019d7668f0..f14df284c6b1 100644 --- a/examples/asr/transcribe_speech_parallel.py +++ b/examples/asr/transcribe_speech_parallel.py @@ -32,6 +32,15 @@ predict_ds.batch_size=16 \ output_path=/tmp/ +Example for Hybrid-CTC/RNNT models with non-tarred datasets: + +python transcribe_speech_parallel.py \ + model=stt_en_fastconformer_hybrid_large \ + decoder_type=ctc \ + predict_ds.manifest_filepath=/dataset/manifest_file.json \ + predict_ds.batch_size=16 \ + output_path=/tmp/ + Example for tarred datasets: python transcribe_speech_parallel.py \ @@ -73,7 +82,7 @@ from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.models import ASRModel +from 
nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel from nemo.collections.asr.models.configs.asr_models_config import ASRDatasetConfig from nemo.core.config import TrainerConfig, hydra_runner from nemo.utils import logging @@ -92,6 +101,10 @@ class ParallelTranscriptionConfig: # decoding strategy for RNNT models rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig() + + # decoder for hybrid models, must be one of 'ctc', 'rnnt' if not None + decoder_type: Optional[str] = None + trainer: TrainerConfig = TrainerConfig(devices=-1, accelerator="gpu", strategy="ddp") @@ -137,6 +150,9 @@ def main(cfg: ParallelTranscriptionConfig): ) model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu") + if isinstance(model, EncDecHybridRNNTCTCModel) and cfg.decoder_type is not None: + model.change_decoding_strategy(decoder_type=cfg.decoder_type) + trainer = ptl.Trainer(**cfg.trainer) cfg.predict_ds.return_sample_id = True diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 14e8dea19651..d5dcc8be4847 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -713,6 +713,7 @@ def write_on_batch_end( item = {} sample = self.dataset.get_manifest_sample(sample_id) item["audio_filepath"] = sample.audio_file + item["offset"] = sample.offset item["duration"] = sample.duration item["text"] = sample.text_raw item["pred_text"] = transcribed_text From d870644fdabc820c5307703f264fc63e470a0ce9 Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Thu, 22 Jun 2023 15:27:44 -0400 Subject: [PATCH 056/123] Make Gradio library optional (#6904) * make gradio optinoal Signed-off-by: Yi Dong * update readme Signed-off-by: Yi Dong --------- Signed-off-by: Yi Dong --- README.rst | 8 ++++++ .../nlp/modules/common/chatbot_component.py | 22 +++++++++++++++- .../nlp/modules/common/megatron_web_server.py | 
26 ++++++++++++++++--- requirements/requirements_nlp.txt | 1 - 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 6742eb1f07d4..869782ab372f 100644 --- a/README.rst +++ b/README.rst @@ -290,6 +290,14 @@ Transformer Engine already supports Flash Attention for GPT models. If you want pip install flash-attn pip install triton==2.0.0.dev20221202 +NLP inference UI +~~~~~~~~~~~~~~~~~~~~ +To launch the inference web UI server, please install the gradio `gradio `_. + +.. code-block:: bash + + pip install gradio==3.34.0 + NeMo Text Processing ~~~~~~~~~~~~~~~~~~~~ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing `_. diff --git a/nemo/collections/nlp/modules/common/chatbot_component.py b/nemo/collections/nlp/modules/common/chatbot_component.py index 548458df7e29..afc86d9defec 100644 --- a/nemo/collections/nlp/modules/common/chatbot_component.py +++ b/nemo/collections/nlp/modules/common/chatbot_component.py @@ -19,9 +19,29 @@ """ from __future__ import annotations -from gradio.components import * +import warnings + from markdown2 import Markdown +try: + from typing import Any, Callable, Dict, List, Literal, Tuple + + from gradio.components import ( + Changeable, + Component, + Enum, + EventListenerMethod, + IOComponent, + JSONSerializable, + Selectable, + document, + processing_utils, + ) + + GRADIO_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + GRADIO_AVAILABLE = False + class _Keywords(Enum): NO_VALUE = "NO_VALUE" # Used as a sentinel to determine if nothing is provided as a argument for `value` in `Component.update()` diff --git a/nemo/collections/nlp/modules/common/megatron_web_server.py b/nemo/collections/nlp/modules/common/megatron_web_server.py index 884f7abe5f01..d3ccde49a5c5 100644 --- a/nemo/collections/nlp/modules/common/megatron_web_server.py +++ b/nemo/collections/nlp/modules/common/megatron_web_server.py @@ -14,10 
+14,14 @@ import asyncio -import gradio as gr +try: + import gradio as gr + + GRADIO_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + GRADIO_AVAILABLE = False from nemo.collections.nlp.modules.common.chat_css import CSS -from nemo.collections.nlp.modules.common.chatbot_component import Chatbot from nemo.collections.nlp.modules.common.megatron.retrieval_services.util import ( convert_retrieved_to_md, request_data, @@ -30,8 +34,17 @@ DEFAULT_SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" SYSTEM_TOKEN = 'System\n' -# HUMAN_TOKEN = 'Human:' -# ASSITANT_TOKEN = 'Assistant:' + + +def check_gradio_import(): + if not GRADIO_AVAILABLE: + msg = ( + f"could not find the gradio library.\n" + f"****************************************************************\n" + f"To install it, please follow the steps below:\n" + f"pip install gradio==3.34.0\n" + ) + raise ImportError(msg) def create_gen_function(port=5555, chat=False): @@ -89,6 +102,7 @@ def get_generation( def get_demo(share, username, password, server_port=5555, web_port=9889, loop=None): + check_gradio_import() asyncio.set_event_loop(loop) with gr.Blocks() as demo: with gr.Row(): @@ -132,6 +146,9 @@ def get_demo(share, username, password, server_port=5555, web_port=9889, loop=No def get_chatbot_demo(share, username, password, server_port=5555, web_port=9889, loop=None): + check_gradio_import() + from nemo.collections.nlp.modules.common.chatbot_component import Chatbot + asyncio.set_event_loop(loop) with gr.Blocks(css=CSS) as demo: # store the mutliple turn conversation @@ -294,6 +311,7 @@ def reset_index(self): return request_data(data, self.combo_service_ip, self.combo_service_port) def run_demo(self, share, username, password, port): + check_gradio_import() with gr.Blocks(css="table, th, td { border: 1px solid blue; table-layout: fixed; width: 100%; }") as demo: with gr.Row(): with 
gr.Column(scale=2, width=200): diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 582862361a22..2018de6fbc31 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -5,7 +5,6 @@ fasttext flask_restful ftfy gdown -gradio>=3.28.3 h5py ijson inflect From 4726650aee4d97900f5735262bf0c64d10dd50cc Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 22 Jun 2023 19:06:52 -0400 Subject: [PATCH 057/123] Update Frame-VAD doc (#6902) * update fvad doc Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- examples/asr/asr_vad/README.md | 12 ++- examples/asr/speech_classification/README.md | 97 +++++++++++++++---- .../speech_classification/frame_vad_infer.py | 7 ++ .../speech_to_frame_label.py | 2 +- .../Offline_ASR_with_VAD_for_CTC_models.ipynb | 17 +++- tutorials/asr/Voice_Activity_Detection.ipynb | 31 +++++- 6 files changed, 143 insertions(+), 23 deletions(-) diff --git a/examples/asr/asr_vad/README.md b/examples/asr/asr_vad/README.md index 9385b96a79ea..f39b9735b20f 100644 --- a/examples/asr/asr_vad/README.md +++ b/examples/asr/asr_vad/README.md @@ -8,10 +8,16 @@ There are two types of input - A manifest passed to `manifest_filepath`, - A directory containing audios passed to `audio_dir` and also specify `audio_type` (default to `wav`). -The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "text"] are required. An example of a manifest file is: +The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration"] are required. 
An example of a manifest file is: ```json -{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "text": "a b c d e"} -{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "text": "f g h i j"} +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000} +``` + +If you want to calculate WER, provide `text` in manifest as groundtruth. An example of a manifest file is: +```json +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "text": "hello world"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "text": "hello world"} ``` ## Output diff --git a/examples/asr/speech_classification/README.md b/examples/asr/speech_classification/README.md index 86bba3dc65a4..4fa5d3c4f2b3 100644 --- a/examples/asr/speech_classification/README.md +++ b/examples/asr/speech_classification/README.md @@ -1,25 +1,88 @@ # Speech Classification -This directory contains example scripts to train speech classification and voice activity detection models. +This directory contains example scripts to train speech classification and voice activity detection models. There are two types of VAD models: Frame-VAD and Segment-VAD. -# Model execution overview +## Frame-VAD -The training scripts in this directory execute in the following order. When preparing your own training-from-scratch / fine-tuning scripts, please follow this order for correct training/inference. +The frame-level VAD model predicts for each frame of the audio whether it has speech or not. For example, with the default config file (`../conf/marblenet/marblenet_3x2x64_20ms.yaml`), the model provides a probability for each frame of 20ms length. 
-```mermaid +### Training +```sh +python speech_to_label.py \ + --config-path= + --config-name= \ + model.train_ds.manifest_filepath="[,]" \ + model.validation_ds.manifest_filepath=["",""] \ + trainer.devices=-1 \ + trainer.accelerator="gpu" \ + strategy="ddp" \ + trainer.max_epochs=100 +``` + +The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "label"] are required. An example of a manifest file is: +``` +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "label": "0 1 0 0 1"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "label": "0 0 0 1 1 1 1 0 0"} +``` +For example, if you have a 1s audio file, you'll need to have 50 frame labels in the manifest entry like "0 0 0 0 1 1 0 1 .... 0 1". +However, shorter label strings are also supported for smaller file sizes. For example, you can prepare the `label` in 40ms frame, and the model will properly repeat the label for each 20ms frame. 
+ + +### Inference +python frame_vad_infer.py \ + --config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \ + dataset= + +The manifest json file should have the following format (each line is a Python dictionary): +``` +{"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000} +{"audio_filepath": "/path/to/audio_file2.wav", "offset": 0, "duration": 10000} +``` + +#### Evaluation +If you want to evaluate tne model's AUROC and DER performance, you need to set `evaluate: True` in config yaml (e.g., `../conf/vad/frame_vad_infer_postprocess.yaml`), and also provide groundtruth in label strings: +``` +{"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000, "label": "0 1 0 0 0 1 1 1 0"} +``` +or RTTM files: +``` +{"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000, "rttm_filepath": "/path/to/rttm_file1.rttm"} +``` + + +## Segment-VAD + +Segment-level VAD predicts a single label for each segment of audio (e.g., 0.63s by default). + +### Training +```sh +python speech_to_label.py \ + --config-path= \ + --config-name= \ + model.train_ds.manifest_filepath="[,]" \ + model.validation_ds.manifest_filepath=["",""] \ + trainer.devices=-1 \ + trainer.accelerator="gpu" \ + strategy="ddp" \ + trainer.max_epochs=100 +``` -graph TD - A[Hydra Overrides + Yaml Config] --> B{Config} - B --> |Init| C[Trainer] - C --> D[ExpManager] - B --> D[ExpManager] - C --> E[Model] - B --> |Init| E[Model] - E --> |Constructor| F(Change Labels) - F --> G(Setup Train + Validation + Test Data loaders) - G --> H(Setup Optimization) - H --> I[Maybe init from pretrained] - I --> J["trainer.fit(model)"] +The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "label"] are required. 
An example of a manifest file is: +``` +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 0.63, "label": "0"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 0.63, "label": "1"} ``` -During restoration of the model, you may pass the Trainer to the restore_from / from_pretrained call, or set it after the model has been initialized by using `model.set_trainer(Trainer)`. \ No newline at end of file + +### Inference +```sh +python vad_infer.py \ + --config-path="../conf/vad" \ + --config-name="vad_inference_postprocessing.yaml" + dataset= +``` +The manifest json file should have the following format (each line is a Python dictionary): +``` +{"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000} +{"audio_filepath": "/path/to/audio_file2.wav", "offset": 0, "duration": 10000} +``` diff --git a/examples/asr/speech_classification/frame_vad_infer.py b/examples/asr/speech_classification/frame_vad_infer.py index 56eb7584e3db..f716eb45bb64 100644 --- a/examples/asr/speech_classification/frame_vad_infer.py +++ b/examples/asr/speech_classification/frame_vad_infer.py @@ -26,6 +26,13 @@ The manifest json file should have the following format (each line is a Python dictionary): {"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000} {"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000} + +If you want to evaluate tne model's AUROC and DER performance, you need to set `evaluate=True` in config yaml, +and also provide groundtruth in either RTTM files or label strings: +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "label": "0 1 0 0 0 1 1 1 0"} +or +{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "rttm_filepath": "/path/to/rttm_file1.rttm"} + """ import os diff --git a/examples/asr/speech_classification/speech_to_frame_label.py b/examples/asr/speech_classification/speech_to_frame_label.py index 3289845ec3d3..04fcbdd1b61c 100644 
--- a/examples/asr/speech_classification/speech_to_frame_label.py +++ b/examples/asr/speech_classification/speech_to_frame_label.py @@ -32,7 +32,7 @@ The input manifest must be a manifest json file, where each line is a Python dictionary. The fields ["audio_filepath", "offset", "duration", "label"] are required. An example of a manifest file is: ``` {"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000, "label": "0 1 0 0 1"} -{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "text": "0 0 0 1 1 1 1 0 0"} +{"audio_filepath": "/path/to/audio_file2", "offset": 0, "duration": 10000, "label": "0 0 0 1 1 1 1 0 0"} ``` For example, if you have a 1s audio file, you'll need to have 50 frame labels in the manifest entry like "0 0 0 0 1 1 0 1 .... 0 1". However, shorter label strings are also supported for smaller file sizes. For example, you can prepare the `label` in 40ms frame, and the model will properly repeat the label for each 20ms frame. diff --git a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb index 1445afe9e381..7e9d0378bc1f 100644 --- a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb +++ b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb @@ -50,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -57,6 +58,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -72,6 +74,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -132,6 +135,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -139,6 +143,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -154,6 +159,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -182,6 +188,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -198,6 
+205,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -215,6 +223,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -239,6 +248,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -255,6 +265,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -262,6 +273,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -289,6 +301,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -313,6 +326,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -320,6 +334,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -374,7 +389,7 @@ "source": [ "# Further Reading\n", "\n", - "There are two ways to incorporate VAD into ASR pipeline. The first strategy is to drop the frames that are predicted as `non-speech` by VAD, as already discussed in this tutorial. The second strategy is to keep all the frames and mask the `non-speech` frames with zero-signal values. Also, instead of using segment-VAD as shown in this tutorial, we can use frame-VAD model for faster inference and better accuracy. For more information, please refer to the two scripts [speech_to_text_with_vad.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr_vad/speech_to_text_with_vad.py)." + "There are two ways to incorporate VAD into ASR pipeline. The first strategy is to drop the frames that are predicted as `non-speech` by VAD, as already discussed in this tutorial. The second strategy is to keep all the frames and mask the `non-speech` frames with zero-signal values. Also, instead of using segment-VAD as shown in this tutorial, we can use frame-VAD model for faster inference and better accuracy. 
For more information, please refer to the script [speech_to_text_with_vad.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr_vad/speech_to_text_with_vad.py)." ] } ], diff --git a/tutorials/asr/Voice_Activity_Detection.ipynb b/tutorials/asr/Voice_Activity_Detection.ipynb index 8b95698c71e8..b1bdd434511b 100644 --- a/tutorials/asr/Voice_Activity_Detection.ipynb +++ b/tutorials/asr/Voice_Activity_Detection.ipynb @@ -41,6 +41,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -81,6 +82,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab": {}, @@ -98,6 +100,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -172,6 +175,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -205,6 +209,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -244,6 +249,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -272,6 +278,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -336,6 +343,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -362,6 +370,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -391,6 +400,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -469,6 +479,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -522,6 +533,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -548,6 +560,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -585,6 +598,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -592,6 +606,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { 
"colab_type": "text", @@ -628,6 +643,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -652,6 +668,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -671,6 +688,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -699,6 +717,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -711,6 +730,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -723,6 +743,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -751,6 +772,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -835,6 +857,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -918,6 +941,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -943,6 +967,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -969,6 +994,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -1068,6 +1094,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1104,6 +1131,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1117,6 +1145,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "colab_type": "text", @@ -1143,7 +1172,7 @@ "\n", "During inference, since frame-VAD model doesn't require splicing input into overlapping segments, it is more efficient than segment-VAD model, with 8x less GPU memory consumption.\n", "\n", - "For more information on the frame-VAD model, please refer to the [model class](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/classification_models.py#L840). 
For training and running inference on frame-VAD, please refer to [speech_to_frame_label.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/speech_to_frame_label.py) and [frame_vad_infer.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/frame_vad_infer.py)." + "For more information on the frame-VAD model, please refer to the [README.md](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/README.md). For training and running inference on frame-VAD, please refer to [speech_to_frame_label.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/speech_to_frame_label.py) and [frame_vad_infer.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/speech_classification/frame_vad_infer.py)." ] } ], From 722e77cd89c8a2ce5bf9a4052efb1e5ef7f327b2 Mon Sep 17 00:00:00 2001 From: fayejf <36722593+fayejf@users.noreply.github.com> Date: Fri, 23 Jun 2023 10:49:26 -0700 Subject: [PATCH 058/123] Update container info in README.rst (#6913) Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 869782ab372f..8a788da71550 100644 --- a/README.rst +++ b/README.rst @@ -304,13 +304,13 @@ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separa Docker containers: ~~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.18.1`` comes with container ``nemo:23.03``, you may find more details about released containers in `releases page `_. +We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.19.0`` comes with container ``nemo:23.04``, you may find more details about released containers in `releases page `_. To use built container, please run .. 
code-block:: bash - docker pull nvcr.io/nvidia/nemo:23.03 + docker pull nvcr.io/nvidia/nemo:23.04 To build a nemo container with Dockerfile from a branch, please run From 74cbbb2859c0093dbde0e8aeedf0fc6d65849790 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Sat, 24 Jun 2023 03:19:19 +0800 Subject: [PATCH 059/123] Fix fast-glu activation in change partitions (#6909) * Fix fast-swiglu Signed-off-by: hsiehjackson * change to all fast glu activation Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson --- .../megatron_change_num_partitions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index 2938a16098a1..72655089e0ee 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -199,7 +199,7 @@ def compute_tp_splits( # alias the global index to idx idx = global_idx - swiglu_activation = 'swiglu' in str(model_cfg.get('activation', '')).lower() + fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] if param.shape == partitions[0][idx].shape: split = [partitions[0][idx].data] * tp_size @@ -230,8 +230,8 @@ def compute_tp_splits( for i in range(tp_size): tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)]) split.append(tp_qkv) - elif 'dense_h_to_4h.weight' in param_name and swiglu_activation: - # For Megatron GPT model with Swiglu activation + elif 'dense_h_to_4h.weight' in param_name and fast_glu_activation: + # For Megatron GPT model with Fast Glu activation # Handle gated linear units # concat all the first halves ('W's) and all the second halves ('V's) w_split, k_split = torch.chunk(partitions[0][idx].data, 2, dim=0) @@ -261,7 +261,7 @@ def 
compute_tp_merge(idx, name, param, partitions_pp, model_cfg): Returns: The concatenated parameter for TP 1 PP 1. """ - swiglu_activation = 'swiglu' in str(model_cfg.get('activation', '')).lower() + fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] # Logic from original TP rank change if param.shape == partitions_pp[0][idx].shape: @@ -271,8 +271,8 @@ def compute_tp_merge(idx, name, param, partitions_pp, model_cfg): else: concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=0) - # Logic for Swiglu activation - if 'dense_h_to_4h.weight' in name and swiglu_activation: + # Logic for Fast Glu activation + if 'dense_h_to_4h.weight' in name and fast_glu_activation: # concat all the first halves ('W's) and all the second halves ('V's) wk_splits = [] for tpr in range(len(partitions_pp)): From c4e677a2d7aad47dbade8c3a0e47311a51d03bba Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Sun, 25 Jun 2023 14:19:59 -0700 Subject: [PATCH 060/123] Documentation for ONNX export of Megatron Models (#6914) * add Megatron ONNX export guide Signed-off-by: Asfiya Baig * fix formatting Signed-off-by: Asfiya Baig * include megatron_onnx_export in api.rst Signed-off-by: Asfiya Baig * include megatron_onnx_export in index.rst Signed-off-by: Asfiya Baig * update installation section Signed-off-by: Asfiya Baig * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert changes to megatron_ckpt_to_nemo.py Signed-off-by: Asfiya Baig * address comments Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- docs/source/index.rst | 1 + docs/source/nlp/api.rst | 10 +++++ docs/source/nlp/megatron_onnx_export.rst | 47 ++++++++++++++++++++++++ 3 files changed, 58 
insertions(+) create mode 100644 docs/source/nlp/megatron_onnx_export.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index ee1d3fba805a..dcf2ff30e9c5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,6 +45,7 @@ NVIDIA NeMo User Guide nlp/machine_translation/machine_translation nlp/text_normalization/intro nlp/api + nlp/megatron_onnx_export nlp/models diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index 7c6971a68d05..0822ade0224c 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -140,3 +140,13 @@ Datasets .. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset.UL2Dataset :show-inheritance: +Exportable Model Classes +------------------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTExportableModel + :show-inheritance: + +.. toctree:: + :maxdepth: 1 + + megatron_onnx_export \ No newline at end of file diff --git a/docs/source/nlp/megatron_onnx_export.rst b/docs/source/nlp/megatron_onnx_export.rst new file mode 100644 index 000000000000..ee6138d1f912 --- /dev/null +++ b/docs/source/nlp/megatron_onnx_export.rst @@ -0,0 +1,47 @@ +.. _megatron_onnx_export: + +ONNX Export of Megatron Models +==================================== + +This guide demonstrates the usage of the ONNX export functionality for Megatron models. + +Requirements +----------------- +Set up the development environment by launching the latest `NeMo container `_ + +The minimum version requirements for NeMo and TransformerEngine are below + +.. code-block:: bash + + nemo > 1.19 + transformer_engine > 0.10 + +Export to ONNX +----------------- +The export script supports the ONNX export of models with .nemo and .ckpt file extensions. The script also supports the export of the following types of models: GPT, T5, BERT, BART, NMT, RETRO. +Commands for both file formats are discussed in the following sections. The model type used for the examples is GPT. 
+ + +Export using .nemo file +^^^^^^^^^^^^^^^^^^^^^^^^ +A model with .nemo file extension can be exported using the command below + +.. code-block:: bash + + python3 examples/nlp/language_modeling/megatron_export.py \ + model_type=gpt \ + onnx_model_file=gpt_126m.onnx \ + gpt_model_file=gpt_126m.nemo + +Export using .ckpt file +^^^^^^^^^^^^^^^^^^^^^^^^ +A model with .ckpt file extension can be exported using the command below + +.. code-block:: bash + + python3 examples/nlp/language_modeling/megatron_export.py \ + model_type=gpt \ + onnx_model_file=gpt_126m.onnx \ + checkpoint_dir=./gpt_126m/ \ + checkpoint_name=model_weights.ckpt \ + hparams_file=./gpt_126m/hparams.yaml \ No newline at end of file From f344fdbe03d43dc7fecafc5cb3ae6ca937b901b9 Mon Sep 17 00:00:00 2001 From: Greg Heinrich Date: Mon, 26 Jun 2023 16:15:13 +0200 Subject: [PATCH 061/123] FixTextMemMapDataset index file creation in multi-node setup (#6768) * Fix for isolated filesystems in multi-node setting Signed-off-by: Greg Heinrich * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Greg Heinrich Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Micha Livne --- .../language_modeling/text_memmap_dataset.py | 66 +++++++++++++------ tests/collections/nlp/test_mem_map_dataset.py | 12 ++-- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index e1a30a3aafb7..05d10b42e115 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -25,11 +25,11 @@ import torch from nemo.core import Dataset -from nemo.utils import logging +from nemo.utils import AppState, logging -__all__ = ['TextMemMapDataset', 'CSVMemMapDataset', 'build_index_files'] 
-__idx_version__ = '0.2' # index file version -__idx_suffix__ = 'idx' # index file suffix +__all__ = ["TextMemMapDataset", "CSVMemMapDataset", "build_index_files"] +__idx_version__ = "0.2" # index file version +__idx_suffix__ = "idx" # index file suffix def _build_index_from_memdata(fn, newline_int): @@ -40,7 +40,7 @@ def _build_index_from_memdata(fn, newline_int): Returns a 1D array of ints. """ # use memmap to read file - mdata = np.memmap(fn, dtype=np.uint8, mode='r') + mdata = np.memmap(fn, dtype=np.uint8, mode="r") # find newline positions midx = np.where(mdata == newline_int)[0] midx_dtype = midx.dtype @@ -115,9 +115,10 @@ def __init__( logging.info(f"Building data files") # load all files into memmap - is_ditributed = torch.distributed.is_available() and torch.distributed.is_initialized() + is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized() - if not is_ditributed or (is_ditributed and torch.distributed.get_rank() == 0): + if not is_distributed or (is_distributed and torch.distributed.get_rank() == 0): + # Create index files on global rank 0. build_index_files( dataset_paths, newline_int, @@ -126,14 +127,39 @@ def __init__( index_mapping_dir=index_mapping_dir, ) - if is_ditributed: + if is_distributed: + torch.distributed.barrier() + + if is_distributed and AppState().local_rank == 0: + # If we are in a distributed multi-node set-up and index files are not stored on + # a shared filesystem, then the index files created on global rank 0 are only + # accessible to the workers on that node. + # + # Two cases may occur here: + # + # 1. case of a shared filesystem, or global_rank==0: the index files are present in + # the locally available filesystem, calling build_index_files() again is a no-op. + # 2. case of a non-shared filesystem, and global_rank>0: the index files are not + # present in the locally available filesystem, calling build_index_files() again + # will create them. 
+ # + # Outcome in all cases: all nodes have access to the index files in their filesystem. + build_index_files( + dataset_paths, + newline_int, + workers=self._worker, + build_index_fn=build_index_fn, + index_mapping_dir=index_mapping_dir, + ) + + if is_distributed: torch.distributed.barrier() logging.info(f"Loading data files") start_time = time.time() mdata_midx_list = [self.load_file(fn, index_mapping_dir) for fn in self._files_list] logging.info( - f'Time loading {len(mdata_midx_list)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}' + f"Time loading {len(mdata_midx_list)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}" ) logging.info("Computing global indices") @@ -224,34 +250,34 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None): idx_fn = _index_fn(fn, index_mapping_dir) # create data map - mdata = np.memmap(fn, dtype=np.uint8, mode='r') + mdata = np.memmap(fn, dtype=np.uint8, mode="r") if _index_file_exists(idx_fn): # load index file into memory map - midx = np.load(idx_fn + ".npy", allow_pickle=True, mmap_mode='r') + midx = np.load(idx_fn + ".npy", allow_pickle=True, mmap_mode="r") # test for header if len(midx) < self._header_lines: raise RuntimeError(f"Missing header, expected {self._header_lines} header lines") # load meta info - idx_info_dict = pickle.load(open(idx_fn + ".info", 'rb')) + idx_info_dict = pickle.load(open(idx_fn + ".info", "rb")) # test for mismatch in expected newline_int - if 'newline_int' in idx_info_dict: - newline_int = idx_info_dict['newline_int'] + if "newline_int" in idx_info_dict: + newline_int = idx_info_dict["newline_int"] if self._newline_int != newline_int: logging.warning( f"Mismatch in newline_int, expected = {self._newline_int} but loaded {newline_int}" ) # test for version mismatch (useful to force recreation of index files) - idx_version = idx_info_dict.get('version', '0.0') + idx_version = idx_info_dict.get("version", "0.0") if __idx_version__ != 
idx_version: raise RuntimeError( f"Version mismatch: Please delete existing '.{__idx_suffix__}' files. Expected version = {__idx_version__}, but file version = {idx_version}. File path = {idx_fn}" ) else: raise ValueError( - f'Memory Map for {fn} is not found, missing one or more of files: {idx_fn}.{{.npy,.info}}' + f"Memory Map for {fn} is not found, missing one or more of files: {idx_fn}.{{.npy,.info}}" ) return (mdata, midx) @@ -271,7 +297,7 @@ def __init__( tokenizer: Optional[Type["TokenizerSpec"]] = None, sort_dataset_paths: Optional[bool] = True, data_col=1, - data_sep=',', + data_sep=",", index_mapping_dir: Optional[str] = None, ): """ @@ -424,7 +450,7 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir def build_index_files( - dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None + dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None, ): """Auxiliary method to build multiple index files""" if len(dataset_paths) < 1: @@ -438,10 +464,10 @@ def build_index_files( start_time = time.time() with mp.Pool(workers) as p: build_status = p.map( - partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir), + partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,), dataset_paths, ) logging.info( - f'Time building {sum(build_status)} / {len(build_status)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}' + f"Time building {sum(build_status)} / {len(build_status)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}" ) diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py index b60636022e05..1e21b6d270c9 100644 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ b/tests/collections/nlp/test_mem_map_dataset.py @@ -27,13 +27,17 @@ def 
jsonl_file(tmp_path): file_path = tmp_path / "data.jsonl" # Generate data to write to the JSONL file - data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}, {"name": "Bob", "age": 35}] + data = [ + {"name": "John", "age": 30}, + {"name": "Jane", "age": 25}, + {"name": "Bob", "age": 35}, + ] # Write data to the JSONL file with open(file_path, mode="w") as file: for item in data: json.dump(item, file) - file.write('\n') + file.write("\n") # Provide the file path to the test function yield str(file_path) @@ -81,12 +85,12 @@ def test_csv_mem_map_dataset(csv_file): @pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset] + "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], ) @pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) @pytest.mark.parametrize("relative_index_fn", [True, False]) def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn + tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, ): """Test for index_mapping_dir.""" if relative_index_fn: From ef56c97e65921bff3e4544f46a69f3336f3ab99a Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Mon, 26 Jun 2023 09:23:58 -0700 Subject: [PATCH 062/123] Move model change out of if-branch (#6908) Signed-off-by: Elena Rastorgueva --- nemo/collections/asr/parts/utils/transcribe_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 11e7792cfb21..f8a69fbe817d 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -189,11 +189,6 @@ def setup_model(cfg: DictConfig, map_location: torch.device) 
-> Tuple[ASRModel, asr_model = imported_class.restore_from( restore_path=cfg.model_path, map_location=map_location, ) # type: ASRModel - if hasattr(cfg, "model_change"): - asr_model.change_attention_model( - self_attention_model=cfg.model_change.conformer.get("self_attention_model", None), - att_context_size=cfg.model_change.conformer.get("att_context_size", None), - ) model_name = os.path.splitext(os.path.basename(cfg.model_path))[0] else: # restore model by name @@ -202,6 +197,12 @@ def setup_model(cfg: DictConfig, map_location: torch.device) -> Tuple[ASRModel, ) # type: ASRModel model_name = cfg.pretrained_name + if hasattr(cfg, "model_change"): + asr_model.change_attention_model( + self_attention_model=cfg.model_change.conformer.get("self_attention_model", None), + att_context_size=cfg.model_change.conformer.get("att_context_size", None), + ) + return asr_model, model_name From e736c863563b86f9be2bc3b9b9359f6ae268ff5a Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Mon, 26 Jun 2023 17:23:34 -0400 Subject: [PATCH 063/123] Update fvad doc (#6920) * update fvad doc Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * update fvad example Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- .../conf/vad/frame_vad_infer_postprocess.yaml | 8 ++++---- examples/asr/speech_classification/README.md | 17 +++++++++++++++++ nemo/collections/asr/parts/utils/vad_utils.py | 6 +++--- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml index 842c04777c72..d759a809ec37 100644 --- a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml +++ b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml @@ -21,10 +21,10 @@ vad: postprocessing: onset: 0.3 # onset threshold for detecting the beginning and end of a 
speech offset: 0.3 # offset threshold for detecting the end of a speech. - pad_onset: 0.5 # adding durations before each speech segment - pad_offset: 0.5 # adding durations after each speech segment - min_duration_on: 0.0 # threshold for short speech deletion - min_duration_off: 0.6 # threshold for short non-speech segment deletion + pad_onset: 0.2 # adding durations before each speech segment + pad_offset: 0.2 # adding durations after each speech segment + min_duration_on: 0.2 # threshold for short speech deletion + min_duration_off: 0.2 # threshold for short non-speech segment deletion filter_speech_first: True prepared_manifest_vad_input: null # if not specify, it will automatically generated be "manifest_vad_input.json" diff --git a/examples/asr/speech_classification/README.md b/examples/asr/speech_classification/README.md index 4fa5d3c4f2b3..bdd3aead8db1 100644 --- a/examples/asr/speech_classification/README.md +++ b/examples/asr/speech_classification/README.md @@ -86,3 +86,20 @@ The manifest json file should have the following format (each line is a Python d {"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000} {"audio_filepath": "/path/to/audio_file2.wav", "offset": 0, "duration": 10000} ``` + + +## Visualization + +To visualize the VAD outputs, you can use the `nemo.collections.asr.parts.utils.vad_utils.plot_sample_from_rttm` function, which takes an audio file and an RTTM file as input, and plots the audio waveform and the VAD labels. 
Since the VAD inference script will output a json manifest `manifest_vad_out.json` by default, you can create a Jupyter Notebook with the following script and fill in the paths using the output manifest: +```python +from nemo.collections.asr.parts.utils.vad_utils import plot_sample_from_rttm + +plot_sample_from_rttm( + audio_file="/path/to/audio_file.wav", + rttm_file="/path/to/rttm_file.rttm", + offset=0.0, + duration=1000, + save_path="vad_pred.png" +) +``` + diff --git a/nemo/collections/asr/parts/utils/vad_utils.py b/nemo/collections/asr/parts/utils/vad_utils.py index addf3cae29b7..e4f024d231ad 100644 --- a/nemo/collections/asr/parts/utils/vad_utils.py +++ b/nemo/collections/asr/parts/utils/vad_utils.py @@ -1648,7 +1648,7 @@ def frame_vad_infer_load_manifest(cfg: DictConfig): manifest_orig.append(entry) # always prefer RTTM labels if exist - if "label" not in entry or "rttm_filepath" in entry or "rttm_file" in entry: + if "label" not in entry and ("rttm_filepath" in entry or "rttm_file" in entry): rttm_key = "rttm_filepath" if "rttm_filepath" in entry else "rttm_file" segments = load_speech_segments_from_rttm(entry[rttm_key]) label_str = get_frame_labels( @@ -1661,8 +1661,8 @@ def frame_vad_infer_load_manifest(cfg: DictConfig): key_labels_map[uniq_audio_name] = [float(x) for x in label_str.split()] elif entry.get("label", None) is not None: key_labels_map[uniq_audio_name] = [float(x) for x in entry["label"].split()] - else: - raise ValueError("Must have either `label` or `rttm_filepath` in manifest") + elif cfg.evaluate: + raise ValueError("Must have either `label` or `rttm_filepath` in manifest when evaluate=True") return manifest_orig, key_labels_map, key_rttm_map From 82044837a7bcfa1cfd3a91f6b0014e052bdfcaf0 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Tue, 27 Jun 2023 06:06:28 +0800 Subject: [PATCH 064/123] Fix flash-attention (#6901) * Set default apply_query_key_layer_scaling to false 
Signed-off-by: hsiehjackson * Add cross attention test Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_config.yaml | 2 +- .../language_modeling/megatron/gpt_model.py | 2 +- .../nlp/modules/common/megatron/attention.py | 34 ++-- .../modules/common/megatron/fused_softmax.py | 2 +- .../modules/common/megatron/language_model.py | 2 +- .../common/megatron/megatron_decoders.py | 2 +- .../common/megatron/megatron_encoders.py | 2 +- .../modules/common/megatron/transformer.py | 8 +- tests/collections/nlp/test_flash_attention.py | 148 +++++++++++++++--- 9 files changed, 148 insertions(+), 54 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index d1132a32349a..8d7fd09e4307 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -63,7 +63,7 @@ model: attention_dropout: 0.1 # Dropout probability for attention ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. normalization: 'layernorm' # Normalization layer to use. 
Options are 'layernorm', 'rmsnorm' layernorm_epsilon: 1e-5 do_layer_norm_weight_decay: False # True means weight decay on all params diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index b43dc98f2fe7..b32bfdb09f20 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -114,7 +114,7 @@ def __init__( num_layers, num_attention_heads, ffn_hidden_size, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, num_tokentypes=0, parallel_output=True, diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index b0d98e0c2fb1..6025b31c0bd5 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -109,7 +109,7 @@ def __init__( attention_type=AttnType.self_attn, attn_mask_type=AttnMaskType.padding, precision=16, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, use_cpu_initialization=False, megatron_amp_O2=False, @@ -564,7 +564,7 @@ def __init__( num_attention_heads, hidden_size, precision=16, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, use_cpu_initialization=False, megatron_amp_O2=False, @@ -728,7 +728,7 @@ def __init__( attention_type=AttnType.self_attn, attn_mask_type=AttnMaskType.padding, precision=16, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, masked_softmax_fusion=True, attention_dropout=0.1, @@ -928,7 +928,6 @@ def torch_attention(self, query_layer, key_layer, value_layer, attention_mask, a attention_scores += attention_bias attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) - # This is actually dropping out 
entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -966,15 +965,6 @@ def flash_attention(self, query_layer, key_layer, value_layer, attention_mask, a else: return self.flash_attention_cuda(query_layer, key_layer, value_layer, attention_mask,) - def reset_is_causal(self, query_length, key_length, causal): - if query_length != key_length: - if query_length == 1: - return False - raise NotImplementedError( - "Flash attention does not support query and key with different number of tokens, unless number of query tokens is 1." - ) - return causal - def flash_attention_cuda(self, query_layer, key_layer, value_layer, attention_mask): batch_size, seqlen, nheads, _ = query_layer.shape @@ -994,9 +984,7 @@ def flash_attention_cuda(self, query_layer, key_layer, value_layer, attention_ma q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(query_layer, attention_mask_q) k, _, cu_seqlens_k, max_seqlen_k = unpad_input(key_layer, attention_mask_kv) v, _, _, _ = unpad_input(value_layer, attention_mask_kv) - causal = self.reset_is_causal( - query_layer.shape[1], key_layer.shape[1], self.attn_mask_type == AttnMaskType.causal - ) + is_causal = self.attn_mask_type == AttnMaskType.causal and query_layer.shape[1] == key_layer.shape[1] context_layer = flash_attn_unpadded_func( q, k, @@ -1006,7 +994,7 @@ def flash_attention_cuda(self, query_layer, key_layer, value_layer, attention_ma max_seqlen_q, max_seqlen_k, dropout_p=self.attention_dropout_p if self.training else 0.0, - causal=causal, + causal=is_causal, ) # [b, sq, np, hn] @@ -1031,13 +1019,13 @@ def flash_attention_triton(self, query_layer, key_layer, value_layer, attention_ attention_mask_q = attention_mask.unsqueeze(1).unsqueeze(3) attention_mask_kv = attention_mask.unsqueeze(1).unsqueeze(2) - attention_bias = attention_bias.masked_fill(~attention_mask_q, torch.finfo(query_layer.dtype).min) - attention_bias = attention_bias.masked_fill(~attention_mask_kv, 
torch.finfo(query_layer.dtype).min) + if attention_bias.shape[2] == attention_mask_q.shape[2]: + attention_bias = attention_bias.masked_fill(~attention_mask_q, torch.finfo(query_layer.dtype).min) + if attention_bias.shape[3] == attention_mask_kv.shape[3]: + attention_bias = attention_bias.masked_fill(~attention_mask_kv, torch.finfo(query_layer.dtype).min) - causal = self.reset_is_causal( - query_layer.shape[1], key_layer.shape[1], self.attn_mask_type == AttnMaskType.causal - ) - context_layer = flash_attn_func(query_layer, key_layer, value_layer, attention_bias, causal) + is_causal = self.attn_mask_type == AttnMaskType.causal and query_layer.shape[1] == key_layer.shape[1] + context_layer = flash_attn_func(query_layer, key_layer, value_layer, attention_bias, is_causal,) # [b, sq, np, hn] -> [b, np, sq, hn] context_layer = context_layer.permute(0, 2, 1, 3) diff --git a/nemo/collections/nlp/modules/common/megatron/fused_softmax.py b/nemo/collections/nlp/modules/common/megatron/fused_softmax.py index 2c914a67dd12..3da56e597751 100644 --- a/nemo/collections/nlp/modules/common/megatron/fused_softmax.py +++ b/nemo/collections/nlp/modules/common/megatron/fused_softmax.py @@ -53,7 +53,7 @@ def forward_torch_softmax(self, input, mask): probs = torch.nn.Softmax(dim=-1)(mask_output) if mask is not None: all_k_masked = mask.all(axis=-1) - zero_attention_mask = (1.0 - all_k_masked.float())[:, :, :, None] + zero_attention_mask = (1.0 - all_k_masked.type(probs.type()))[:, :, :, None] probs = probs * zero_attention_mask if self.input_in_float16 and self.softmax_in_fp32: diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index a3fa3fd6d2be..e6305e563549 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -70,7 +70,7 @@ def get_language_model( vocab_size, num_attention_heads, encoder_attn_mask_type, - 
apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, init_method=None, scaled_init_method=None, diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py index ca2000842fe4..20f25a25179a 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py @@ -44,7 +44,7 @@ def get_decoder_model( ffn_hidden_size, num_layers, num_attention_heads, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, init_method=None, scaled_init_method=None, diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py index 9f5d917e2077..b98aa26b1b23 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py @@ -45,7 +45,7 @@ def get_encoder_model( ffn_hidden_size, num_layers, num_attention_heads, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, init_method=None, scaled_init_method=None, diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index ea01acd14a23..258e42ce9694 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -141,7 +141,7 @@ def __init__( self_attn_mask_type=AttnMaskType.padding, fp32_residual_connection=False, precision=16, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, layernorm_epsilon=1e-5, hidden_dropout=0.1, @@ -659,7 +659,7 @@ def __init__( self_attn_mask_type=AttnMaskType.padding, fp32_residual_connection=False, precision=16, - apply_query_key_layer_scaling=True, + 
apply_query_key_layer_scaling=False, kv_channels=None, layernorm_epsilon=1e-5, hidden_dropout=0.1, @@ -804,7 +804,7 @@ def __init__( params_dtype: torch.dtype = torch.float32, get_rng_state_tracker: Optional[Callable] = None, fuse_wgrad_accumulation: bool = False, - apply_query_key_layer_scaling: bool = True, + apply_query_key_layer_scaling: bool = False, attention_softmax_in_fp32: bool = False, seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, @@ -895,7 +895,7 @@ def __init__( hidden_size, ffn_hidden_size, num_attention_heads, - apply_query_key_layer_scaling=True, + apply_query_key_layer_scaling=False, kv_channels=None, layer_type=LayerType.encoder, # it can be a list of types or single type self_attn_mask_type=AttnMaskType.padding, diff --git a/tests/collections/nlp/test_flash_attention.py b/tests/collections/nlp/test_flash_attention.py index cead91ff312a..727742fdffb5 100644 --- a/tests/collections/nlp/test_flash_attention.py +++ b/tests/collections/nlp/test_flash_attention.py @@ -87,8 +87,10 @@ def setup_class(cls): def cfg(self): cfg = { 'bz': random.randint(1, 7), - 'sl': random.randint(1, 7), + 'sq': random.randint(2, 7), + 'sk': random.randint(2, 7), 'head': random.randint(1, 7), + 'layer_number': random.randint(1, 7), 'device': torch.cuda.current_device(), } # flash attention requires head dimensions are multiples of 8 @@ -99,9 +101,10 @@ def cfg(self): @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") @pytest.mark.unit - def test_flash_attention(self, cfg): + def test_flash_self_attention(self, cfg): device = cfg['device'] - bz, sl, np, h = cfg['bz'], cfg['sl'], cfg['head'], cfg['hidden'] + layer_number = cfg['layer_number'] + bz, sl, np, h = cfg['bz'], cfg['sq'], cfg['head'], cfg['hidden'] hn = h // np q = torch.rand(sl, bz, np, hn, device=device).half() @@ -122,7 +125,7 @@ def test_flash_attention(self, cfg): # Non-causal attention = CoreAttention( - layer_number=1, + layer_number=layer_number, 
num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.padding, @@ -130,7 +133,7 @@ def test_flash_attention(self, cfg): ) attention_fa = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.padding, @@ -140,21 +143,22 @@ def test_flash_attention(self, cfg): out = attention(q, k, v, attention_mask_padding_3d) out_fa = attention_fa(q, k, v, attention_mask_padding_3d) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa) out_fa = attention_fa(q, k, v, attention_mask_2d) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa) # Causal attention = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.causal, attention_dropout=0.0, + apply_query_key_layer_scaling=False, ) attention_fa = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.causal, @@ -164,9 +168,55 @@ def test_flash_attention(self, cfg): out = attention(q, k, v, attention_mask_causal_3d) out_fa = attention_fa(q, k, v, attention_mask_causal_3d) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa) out_fa = attention_fa(q, k, v, attention_mask_2d) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa) + + @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") + @pytest.mark.unit + def test_flash_cross_attention(self, cfg): + device = cfg['device'] + layer_number = cfg['layer_number'] + bz, sq, sk, np, h = cfg['bz'], cfg['sq'], cfg['sk'], cfg['head'], cfg['hidden'] + hn = h // np + + q = torch.rand(sq, bz, np, hn, device=device).half() + k = torch.rand(sk, bz, np, hn, device=device).half() + v = torch.rand(sk, bz, np, hn, device=device).half() + + attention_mask_2d_q = 
torch.arange(sq, device=device).unsqueeze(0) < torch.randint( + 1, sq, (bz,), device=device + ).unsqueeze(1) + + attention_mask_2d_k = torch.arange(sk, device=device).unsqueeze(0) < torch.randint( + 1, sk, (bz,), device=device + ).unsqueeze(1) + + attention_mask_padding_3d = build_attention_mask_3d( + source_mask=attention_mask_2d_q, target_mask=attention_mask_2d_k, attn_mask_type=AttnMaskType.padding + ).unsqueeze(1) + + attention = CoreAttention( + layer_number=layer_number, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + apply_query_key_layer_scaling=False, + ) + + attention_fa = CoreAttention( + layer_number=layer_number, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_padding_3d) + out_fa = attention_fa(q, k, v, attention_mask_padding_3d) + torch.testing.assert_close(out, out_fa) @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") @pytest.mark.skipif(not HAVE_TRITON, reason="triton is not installed") @@ -175,9 +225,10 @@ def test_flash_attention(self, cfg): reason="should only run on AMPERE GPU. 
Please see https://github.com/HazyResearch/flash-attention/issues/245", ) @pytest.mark.unit - def test_flash_attention_triton(self, cfg): + def test_flash_self_attention_triton(self, cfg): device = cfg['device'] - bz, sl, np, h = cfg['bz'], cfg['sl'], cfg['head'], cfg['hidden'] + layer_number = cfg['layer_number'] + bz, sl, np, h = cfg['bz'], cfg['sq'], cfg['head'], cfg['hidden'] hn = h // np q = torch.rand(sl, bz, np, hn, device=device).half() @@ -200,15 +251,16 @@ def test_flash_attention_triton(self, cfg): # Non-causal attention = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.padding, attention_dropout=0.0, + apply_query_key_layer_scaling=False, ) attention_fa = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.padding, @@ -218,21 +270,22 @@ def test_flash_attention_triton(self, cfg): out = attention(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) out_fa = attention_fa(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa, rtol=1e-3, atol=1e-3) out_fa = attention_fa(q, k, v, attention_mask_2d, relative_position_bias=attention_bias) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa, rtol=1e-3, atol=1e-3) # Causal attention = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.causal, attention_dropout=0.0, + apply_query_key_layer_scaling=False, ) attention_fa = CoreAttention( - layer_number=1, + layer_number=layer_number, num_attention_heads=np, hidden_size=h, attn_mask_type=AttnMaskType.causal, @@ -242,6 +295,59 @@ def test_flash_attention_triton(self, cfg): out = attention(q, k, v, attention_mask_causal_3d, 
relative_position_bias=attention_bias) out_fa = attention_fa(q, k, v, attention_mask_causal_3d, relative_position_bias=attention_bias) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa, rtol=1e-3, atol=1e-3) out_fa = attention_fa(q, k, v, attention_mask_2d, relative_position_bias=attention_bias) - assert torch.allclose(out, out_fa, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(out, out_fa, rtol=1e-3, atol=1e-3) + + @pytest.mark.skipif(not HAVE_FA, reason="flash-attention is not installed") + @pytest.mark.skipif(not HAVE_TRITON, reason="triton is not installed") + @pytest.mark.skipif( + not HAVE_AMPERE_GPU(), + reason="should only run on AMPERE GPU. Please see https://github.com/HazyResearch/flash-attention/issues/245", + ) + @pytest.mark.unit + def test_flash_cross_attention_triton(self, cfg): + device = cfg['device'] + layer_number = cfg['layer_number'] + bz, sq, sk, np, h = cfg['bz'], cfg['sq'], cfg['sk'], cfg['head'], cfg['hidden'] + hn = h // np + + q = torch.rand(sq, bz, np, hn, device=device).half() + k = torch.rand(sk, bz, np, hn, device=device).half() + v = torch.rand(sk, bz, np, hn, device=device).half() + + attention_mask_2d_q = torch.arange(sq, device=device).unsqueeze(0) < torch.randint( + 1, sq, (bz,), device=device + ).unsqueeze(1) + + attention_mask_2d_k = torch.arange(sk, device=device).unsqueeze(0) < torch.randint( + 1, sk, (bz,), device=device + ).unsqueeze(1) + + attention_mask_padding_3d = build_attention_mask_3d( + source_mask=attention_mask_2d_q, target_mask=attention_mask_2d_k, attn_mask_type=AttnMaskType.padding + ).unsqueeze(1) + + attention_bias = torch.rand(bz, np, sq, sk, device=device) + + attention = CoreAttention( + layer_number=layer_number, + num_attention_heads=np, + hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + apply_query_key_layer_scaling=False, + ) + + attention_fa = CoreAttention( + layer_number=layer_number, + num_attention_heads=np, + 
hidden_size=h, + attn_mask_type=AttnMaskType.padding, + attention_dropout=0.0, + use_flash_attention=True, + ) + + out = attention(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) + out_fa = attention_fa(q, k, v, attention_mask_padding_3d, relative_position_bias=attention_bias) + torch.testing.assert_close(out, out_fa, rtol=1e-3, atol=1e-3) From 7e3739bff68d98b7fbe280c36ad64d23bca98a34 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Tue, 27 Jun 2023 15:16:34 -0700 Subject: [PATCH 065/123] ptuning oom fix (#6916) * oom wip Signed-off-by: arendu * minor Signed-off-by: arendu * comments Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron/gpt_prompt_learning_dataset.py | 10 +++++++++- .../language_modeling/megatron_gpt_peft_models.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 15edc673b7cc..4b1b4f61d439 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -327,6 +327,9 @@ def __len__(self): def __getitem__(self, idx): return self.examples[idx] + def _ceil_to_nearest(self, n, m): + return (n + m - 1) // m * m + def collate_fn(self, batch, tp_workers=0): """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ taskname_ids, input_ids, answer_starts = zip(*batch) @@ -350,11 +353,16 @@ def collate_fn(self, batch, tp_workers=0): else: resi_padding = 0 batch_max += resi_padding + ceil_batch_max = self._ceil_to_nearest( + batch_max, 8 + ) # @adithyare this padding 
does not conflict with the tp_workers padding above + # since tp_workers is always a multiple of 2. the padding to multiple of 8 is to ensure an mem-optimized softmax is used. + batch_max = ceil_batch_max + 1 input_ids, loss_mask = self.pad_batch_and_build_loss_mask(input_ids, batch_max, answer_starts) # Should be a label for every token in batch, label is the next token labels = input_ids[:, 1:].contiguous() input_ids = input_ids[:, :-1].contiguous() - batch_max -= 1 + batch_max -= 1 # @adithyare I *think* this negatition is done to account for the above 2 lines which removes one item from the input_ids seq. # Loss mask should align with labels loss_mask = loss_mask[:, 1:].contiguous() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py index f1f44e31e175..73579114234d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -346,7 +346,7 @@ def __init__( AdapterName.LORA_KQV_ADAPTER, ] lora_cfg = cfg.peft.lora_tuning - if cfg.kv_channels is None: + if cfg.get("kv_channels", None) is None: assert ( cfg.hidden_size % cfg.num_attention_heads == 0 ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None' From 350b2a2ddb221b63be3d406b5907b614d32a6e8d Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:29:25 +0300 Subject: [PATCH 066/123] add rampup bs assertion (#6927) * added assertion Signed-off-by: Dmytro Pykhtar * added assertion Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 +++++ 
1 file changed, 5 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c4bfdbbad143..84caed6c111e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1001,6 +1001,11 @@ def setup(self, stage=None): self.init_global_step = self.trainer.global_step if self.rampup_batch_size: + optimizer = self.cfg.optim.get('name', None) + assert ( + optimizer == 'fused_adam' + ), f'{optimizer} optimizer is not supported yet with rampup batch size. Please, use fused_adam optimizer instead.' + num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR num_microbatch_calculator.update(self.init_consumed_samples, consistency_check=False) self.prev_consumed_samples = self.init_consumed_samples From 92c4a2a74d51f07c1dfbb6664c22c0dfa5f5a71e Mon Sep 17 00:00:00 2001 From: trias702 <25867060+trias702@users.noreply.github.com> Date: Tue, 27 Jun 2023 18:10:47 -0500 Subject: [PATCH 067/123] Online Code Switching Dataset for ASR (#6579) * Initial commit of online code switched dataset Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tests for new argument added to BPE classes in audio_to_text.py Signed-off-by: Daniel Egert * Updated logic to catch bad audios with all zeros Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added method docstrings and convert lang_probs to be an optional dict Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated lang_probs docstring to correct type Signed-off-by: Daniel Egert * Added final batch of requested changes and docs Signed-off-by: 
Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Made changes for final release candidate test Signed-off-by: Daniel Egert * Fixed random tabs and changed some docstrings Signed-off-by: Daniel Egert * Changed input types slightly Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * import guard soundfile due to CI failure in test-nlp-imports Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed one last issue with dataloader null checks Signed-off-by: Daniel Egert --------- Signed-off-by: Daniel Egert Signed-off-by: trias702 <25867060+trias702@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/asr/configs.rst | 39 ++ .../asr/data/audio_to_text_dataset.py | 144 ++++++- .../asr/models/classification_models.py | 6 +- nemo/collections/asr/models/ctc_bpe_models.py | 2 +- nemo/collections/asr/models/ctc_models.py | 8 +- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 2 +- nemo/collections/asr/models/label_models.py | 6 +- .../collections/asr/models/rnnt_bpe_models.py | 2 +- nemo/collections/asr/models/rnnt_models.py | 8 +- nemo/collections/asr/models/slu_models.py | 6 +- nemo/collections/asr/models/ssl_models.py | 12 +- nemo/collections/common/data/__init__.py | 2 +- nemo/collections/common/data/dataset.py | 375 +++++++++++++++++- .../common/parts/preprocessing/collections.py | 3 + .../ngram_lm/create_lexicon_from_arpa.py | 4 +- 15 files changed, 599 insertions(+), 20 deletions(-) diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index 120969ee9dfa..f9a4ea9970b1 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -237,6 +237,45 @@ For example, a decoder config corresponding to a sub-word tokenization model sho 
vocabulary: [] # filled with vocabulary from tokenizer at runtime +On-the-fly Code Switching +------------------------- + +Nemo supports creating code-switched synthetic utterances on-the-fly during training/validation/testing. This allows you to create ASR models which +support intra-utterance code switching. If you have Nemo formatted audio data on disk (either JSON manifests or tarred audio data), you +can easily mix as many of these audio sources together as desired by adding some extra parameters to your `train_ds`, `validation_ds`, and `test_ds`. + +Please note that this allows you to mix any kind of audio sources together to create synthetic utterances which sample from all sources. The most +common use case for this is blending different languages together to create a multilingual code-switched model, but you can also blend +together different audio sources from the same languages (or language families), to create noise robust data, or mix fast and slow speech from the +same language. + +For multilingual code-switched models, we recommend using AggTokenizer for your Tokenizer if mixing different languages. + +The following example shows how to mix 3 different languages: English (en), German (de), and Japanese (ja) added to the `train_ds` model block, however +you can add similar logic to your `validation_ds` and `test_ds` blocks for on-the-fly code-switched validation and test data too. This example mixes +together 3 languages, but you can use as many as you want. However, be advised that the more languages you add, the higher your `min_duration` and `max_duration` +need to be set to ensure all languages are sampled into each synthetic utterance, and setting these hyperparameters higher will use more VRAM per mini-batch during +training and evaluation. + +.. 
code-block:: yaml + + model: + train_ds: + manifest_filepath: [/path/to/EN/tarred_manifest.json, /path/to/DE/tarred_manifest.json, /path/to/JA/tarred_manifest.json] + tarred_audio_filepaths: ['/path/to/EN/tars/audio__OP_0..511_CL_.tar', '/path/to/DE/tars/audio__OP_0..1023_CL_.tar', '/path/to/JA/tars/audio__OP_0..2047_CL_.tar'] + is_code_switched: true + is_tarred: true + shuffle: true + code_switched: # add this block for code-switching + min_duration: 12 # the minimum number of seconds for each synthetic code-switched utterance + max_duration: 20 # the maximum number of seconds for each synthetic code-switched utterance + min_monolingual: 0.3 # the minimum percentage of utterances which will be pure monolingual (0.3 = 30%) + probs: [0.25, 0.5, 0.25] # the probability to sample each language (matches order of `language` above) if not provided, assumes uniform distribution + force_monochannel: true # if your source data is multi-channel, then setting this to True will force the synthetic utterances to be mono-channel + sampling_scales: 0.75 # allows you to down/up sample individual languages. 
Can set this as an array for individual languages, or a scalar for all languages + seed: 123 # add a seed for replicability in future runs (highly useful for `validation_ds` and `test_ds`) + + Model Architecture Configurations --------------------------------- diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index d5dcc8be4847..3234b617cc9c 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -19,14 +19,14 @@ from typing import Any, List, Optional, Union import torch -from omegaconf import DictConfig, open_dict +from omegaconf import DictConfig, OmegaConf, open_dict from omegaconf.listconfig import ListConfig from pytorch_lightning.callbacks import BasePredictionWriter from torch.utils.data import ChainDataset from nemo.collections.asr.data import audio_to_text, audio_to_text_dali from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.data.dataset import ConcatDataset +from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset from nemo.utils import logging @@ -398,6 +398,88 @@ def get_tarred_dataset( return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank) +def get_code_switched_dataset( + config: dict, + shuffle_n: int, + global_rank: int, + world_size: int, + tokenizer: Optional['TokenizerSpec'] = None, + augmentor: Optional['AudioAugmentor'] = None, +) -> CodeSwitchedDataset: + + if 'manifest_filepath' not in config: + raise ValueError("`manifest_filepath` must be provided in the dataset config if `is_code_switched=True`") + if 'code_switched' not in config: + raise ValueError("`code_switched` param group must be in the dataset config if `is_code_switched=True`") + + manifest_filepaths = config['manifest_filepath'] + tarred_audio_filepaths = config.get('tarred_audio_filepaths', None) + + cs_config = 
OmegaConf.to_container(config['code_switched']) + + # needed to support validation Datasets that arrive here as + # [[dataset1,dataset2]] otherwise ModelPT would interfere + if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str): + manifest_filepaths = config['manifest_filepath'][0] + if tarred_audio_filepaths is None: + tarred_audio_filepaths = [None] * len(manifest_filepaths) + + if len(manifest_filepaths) != len(tarred_audio_filepaths): + raise ValueError( + f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of items." + ) + + datasets = [] + for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( + zip(tarred_audio_filepaths, manifest_filepaths) + ): + conf = copy.deepcopy(config) + conf['manifest_filepath'] = manifest_filepath + with open_dict(conf): + conf['tarred_audio_filepaths'] = tarred_audio_filepath + if tarred_audio_filepath is None or len(tarred_audio_filepath) == 0: + if tokenizer is None: + dataset = get_char_dataset(config=conf, augmentor=None) + else: + dataset = get_bpe_dataset(config=conf, tokenizer=tokenizer, augmentor=None) + else: + dataset = get_tarred_dataset( + config=conf, + tokenizer=tokenizer, + shuffle_n=shuffle_n, + global_rank=global_rank, + world_size=world_size, + augmentor=None, + ) + datasets.append(dataset) + + config = OmegaConf.to_container(config) + + dataset = CodeSwitchedDataset( + datasets, + shuffle=cs_config.get('shuffle', True), + min_duration=cs_config.get('min_duration', 4), + max_duration=cs_config.get('max_duration', 20), + min_monolingual=cs_config.get('min_monolingual', 0.3), + lang_probs=cs_config.get('probs', None), + db_norm=cs_config.get('db_norm', -25.0), + pause_start=cs_config.get('pause_start', 0), + pause_join=cs_config.get('pause_join', 0), + pause_end=cs_config.get('pause_end', 0), + sampling_scales=cs_config.get('sampling_scales', None), + 
seed=cs_config.get('seed', None), + global_rank=global_rank, + world_size=world_size, + pure_random=cs_config.get('pure_random', False), + force_monochannel=cs_config.get('force_monochannel', True), + infinity_mode=cs_config.get('infinity_mode', False), + sample_rate=config['sample_rate'], + augmentor=augmentor, + ) + + return dataset + + def get_dali_char_dataset( config: dict, shuffle: bool, @@ -546,8 +628,35 @@ def get_audio_to_text_char_dataset_from_config( ) return dataset + # Instantiate a code-switched dataset if config is present + if config.get('is_code_switched', False): + if 'manifest_filepath' in config and config['manifest_filepath'] is None: + logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") + return None + if not ('code_switched' in config and config['code_switched'] is not None): + logging.warning( + f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}" + ) + return None + if ( + ('probs' in config['code_switched']) + and (config['code_switched']['probs'] is not None) + and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6)) + ): + logging.warning(f"`.code_switched.probs` need to sum to 1. 
Config: {config['code_switched']}") + return None + + shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 + dataset = get_code_switched_dataset( + config=config, + shuffle_n=shuffle_n, + global_rank=global_rank, + world_size=world_size, + tokenizer=None, + augmentor=augmentor, + ) # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): + elif config.get('is_tarred', False): if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( 'manifest_filepath' in config and config['manifest_filepath'] is None ): @@ -645,8 +754,35 @@ def get_audio_to_text_bpe_dataset_from_config( ) return dataset + # Instantiate a code-switched dataset if config is present + if config.get('is_code_switched', False): + if 'manifest_filepath' in config and config['manifest_filepath'] is None: + logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") + return None + if not ('code_switched' in config and config['code_switched'] is not None): + logging.warning( + f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}" + ) + return None + if ( + ('probs' in config['code_switched']) + and (config['code_switched']['probs'] is not None) + and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6)) + ): + logging.warning(f"`.code_switched.probs` need to sum to 1. 
Config: {config['code_switched']}") + return None + + shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 + dataset = get_code_switched_dataset( + config=config, + shuffle_n=shuffle_n, + global_rank=global_rank, + world_size=world_size, + tokenizer=tokenizer, + augmentor=augmentor, + ) # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): + elif config.get('is_tarred', False): if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( 'manifest_filepath' in config and config['manifest_filepath'] is None ): diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index fb0ee82132a1..432674225f5a 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -174,7 +174,11 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 7d3b236b2bab..04547b816fe8 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -106,7 +106,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): return dataset shuffle = config['shuffle'] - if config.get('is_tarred', False): + if isinstance(dataset, torch.utils.data.IterableDataset): shuffle = False if hasattr(dataset, 'collate_fn'): diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 1446e1ce871f..d995544513de 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -365,7 +365,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): return dataset shuffle = config['shuffle'] - if config.get('is_tarred', False): + if isinstance(dataset, torch.utils.data.IterableDataset): shuffle = False if hasattr(dataset, 'collate_fn'): @@ -413,7 +413,11 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index b88669a1fbc0..6637486f18dc 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -144,7 +144,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): return dataset shuffle = config['shuffle'] - if config.get('is_tarred', False): + if isinstance(dataset, torch.utils.data.IterableDataset): shuffle = False if hasattr(dataset, 'collate_fn'): diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index cc789dacff11..1a284aca609d 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -275,7 +275,11 @@ def setup_training_data(self, train_data_layer_config: Optional[Union[DictConfig # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_layer_config and train_data_layer_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 9ed38a376103..aa5486f25811 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -494,7 +494,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): return dataset shuffle = config['shuffle'] - if config.get('is_tarred', False): + if isinstance(dataset, torch.utils.data.IterableDataset): shuffle = False if hasattr(dataset, 'collate_fn'): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index eec663813ca8..92bb04fd2a3e 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -475,7 +475,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): return dataset shuffle = config['shuffle'] - if config.get('is_tarred', False): + if isinstance(dataset, torch.utils.data.IterableDataset): shuffle = False if hasattr(dataset, 'collate_fn'): @@ -523,7 +523,11 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
diff --git a/nemo/collections/asr/models/slu_models.py b/nemo/collections/asr/models/slu_models.py index 2062397c511c..6df907334662 100644 --- a/nemo/collections/asr/models/slu_models.py +++ b/nemo/collections/asr/models/slu_models.py @@ -436,7 +436,11 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). diff --git a/nemo/collections/asr/models/ssl_models.py b/nemo/collections/asr/models/ssl_models.py index dee2559364d0..8de713ca948d 100644 --- a/nemo/collections/asr/models/ssl_models.py +++ b/nemo/collections/asr/models/ssl_models.py @@ -234,7 +234,11 @@ def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. 
- if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + if ( + self._train_dl is not None + and hasattr(self._train_dl, 'dataset') + and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). @@ -270,7 +274,11 @@ def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict # Need to set this because if using an IterableDataset, the length of the dataloader is the total number # of samples rather than the number of batches, and this messes up the tqdm progress bar. # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in val_data_config and val_data_config['is_tarred']: + if ( + self._validation_dl is not None + and hasattr(self._validation_dl, 'dataset') + and isinstance(self._validation_dl.dataset, torch.utils.data.IterableDataset) + ): # We also need to check if limit_train_batches is already set. # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). diff --git a/nemo/collections/common/data/__init__.py b/nemo/collections/common/data/__init__.py index afb12338e548..ecc67ef05ea5 100644 --- a/nemo/collections/common/data/__init__.py +++ b/nemo/collections/common/data/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo.collections.common.data.dataset import ConcatDataset, ConcatMapDataset +from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset, ConcatMapDataset diff --git a/nemo/collections/common/data/dataset.py b/nemo/collections/common/data/dataset.py index 030e997802bc..5b4fba5ef24a 100644 --- a/nemo/collections/common/data/dataset.py +++ b/nemo/collections/common/data/dataset.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import io import logging -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union import numpy as np +import torch import torch.utils.data as pt_data from torch.utils.data import Dataset, IterableDataset -__all__ = ['ConcatDataset', 'ConcatMapDataset'] +__all__ = ['ConcatDataset', 'ConcatMapDataset', 'CodeSwitchedDataset'] class ConcatDataset(IterableDataset): @@ -286,3 +288,372 @@ def __len__(self): def __getitem__(self, idx): dataset_id, dataset_index = self.indices[idx] return self.datasets[dataset_id][dataset_index] + + +class CodeSwitchedDataset(IterableDataset): + """ + A dataset that accepts as argument multiple sub-datasets (usually from different languages, but that's not required) and then + samples from them in order to create synthetic code-switched samples of up to N different sub-datasets + Args: + datasets (list): A list of datasets + lang_probs (list): A list of probabilities (which must sum to 1) corresponding to the sampling probability for each dataset + shuffle (bool): Whether to shuffle individual datasets. Only works with non-iterable datasets. + Defaults to True. + min_duration (int): the minimum duration (secs) of each synthetic code-switched sample. Will draw randomly until this is hit. + Defaults to 4 + max_duration (int): the maximum duration (secs) of each synthetic code-switched sample. 
+ Defaults to 20 + min_monolingual (float): this percentage of the dataset will be original monolingual samples + Defaults to 0.3 - means 30% + db_norm (float): will normalise the composite CS sample to this DB level + Defaults to -25.0 + pause_start (int): inserts silence equal to this value (msecs) at the start of each CS sample + Defaults to 0 + pause_join (int): inserts silence equal to this value (msecs) between all language changes in the CS sample + Defaults to 0 + pause_end (int): terminates all CS samples with silence equal to this value (msecs) + Defaults to 0 + sampling_scales (list or float): gives you the ability to upsample/downsample each individual dataset + seed: Optional value to seed the numpy RNG. + global_rank (int): Worker rank, used for partitioning map style datasets. Defaults to 0. + world_size (int): Total number of processes, used for partitioning map style datasets. Defaults to 1. + pure_random (bool): If true, then always draw random sample from lang_probs. If false, you only draw from those datasets + which you haven't sampled from yet for the composite sample + force_monochannel (bool): If true, then all output audio will be mono-channel + infinity_mode (bool): If true, then the dataset iterable will generate an infinite amount of samples + sample_rate (int): the sample rate of all audio being sent to this Dataset + augmentor (AudioAugmentor): The any perturbations you wish to have applied on the CS samples + """ + + def __init__( + self, + datasets: List[Any], + lang_probs: Optional[List[float]] = None, + shuffle: bool = True, + min_duration: int = 4, + max_duration: int = 20, + min_monolingual: float = 0.3, + db_norm: float = -25.0, + pause_start: int = 0, + pause_join: int = 0, + pause_end: int = 0, + sampling_scales: Optional[Union[float, List[float]]] = None, + seed: Optional[int] = None, + global_rank: int = 0, + world_size: int = 1, + pure_random: bool = False, + force_monochannel: bool = True, + infinity_mode: bool = False, + 
sample_rate: int = 16000, + augmentor: Optional['AudioAugmentor'] = None, + ): + super().__init__() + + if len(datasets) == 0: + raise ValueError("CodeSwitchedDataset must receive a non-zero length datasets dict object") + + self.datasets = datasets + self.langs = list(range(len(datasets))) + self.langs_set = set(self.langs) + self.lang_iterables = {k: None for k in self.langs} + self.lang_kind = {k: None for k in self.langs} + self.shuffle = shuffle + self.min_duration = min_duration + self.max_duration = max_duration + self.min_monolingual = min_monolingual + self.db_norm = db_norm + self.pause_start = pause_start + self.pause_join = pause_join + self.pause_end = pause_end + self.pure_random = pure_random + self.force_monochannel = force_monochannel + self.infinity_mode = infinity_mode + self.global_rank = global_rank + self.world_size = world_size + self.augmentor = augmentor + self.sample_rate = sample_rate + self.length = 0 + if lang_probs is None: + self.prob_dict = {l: 1.0 / len(self.langs) for l in self.langs} + else: + assert len(self.langs) == len( + lang_probs + ), "Size mismatch between languages and respective probs in CodeSwitchedDataset" + self.prob_dict = {l: lang_probs[l] for l in self.langs} + self.lang_probs = np.array(list(self.prob_dict.values())) + if sampling_scales is not None and not isinstance(sampling_scales, list): + self.sampling_scales = {k: sampling_scales for k in self.langs} + elif ( + sampling_scales is not None + and isinstance(sampling_scales, list) + and len(sampling_scales) == len(self.langs) + ): + self.sampling_scales = {k: v for k, v in zip(self.langs, sampling_scales)} + else: + self.sampling_scales = {k: 1 for k in self.langs} + + for lang, dataset in enumerate(self.datasets): + isiterable = isinstance(dataset, IterableDataset) + + if isiterable: + self.lang_kind[lang] = 'iterable' + self.length += int(len(dataset) * self.sampling_scales[lang]) + else: + self.lang_kind[lang] = 'map' + self.length += int((len(dataset) // 
world_size) * self.sampling_scales[lang]) + + if seed is not None: + np.random.seed(seed) + + # set this to ensure compatibility with models searching for the collate_fn + # since this class stores datasets as a dict, not list + # self.collate_fn = self.datasets[self.langs[0]].collate_fn + if hasattr(self.datasets[self.langs[0]], 'collate_fn'): + self.collate_fn = self.datasets[self.langs[0]].collate_fn + elif ( + hasattr(self.datasets[self.langs[0]], 'datasets') + and isinstance(self.datasets[self.langs[0]].datasets, list) + and len(self.datasets[self.langs[0]].datasets) > 0 + and hasattr(self.datasets[self.langs[0]].datasets[0], 'collate_fn') + ): + # support datasets that are lists of entries + self.collate_fn = self.datasets[self.langs[0]].datasets[0].collate_fn + elif ( + hasattr(self.datasets[self.langs[0]], 'datasets') + and isinstance(self.datasets[self.langs[0]].datasets, list) + and len(self.datasets[self.langs[0]].datasets) > 0 + and hasattr(self.datasets[self.langs[0]].datasets[0], 'datasets') + and isinstance(self.datasets[self.langs[0]].datasets[0].datasets, list) + and len(self.datasets[self.langs[0]].datasets[0].datasets) > 0 + and hasattr(self.datasets[self.langs[0]].datasets[0].datasets[0], 'collate_fn') + ): + # support datasets that are lists of lists + self.collate_fn = self.datasets[self.langs[0]].datasets[0].datasets[0].collate_fn + else: + raise RuntimeError("CodeSwitchedDataset could not locate a valid dataset collate_fn to bind to") + + # this method returns an iterator object for a given language ID + # it correctly handles whether the underlying dataset is IterableDataset or mappable + def get_iterable_by_lang(self, lang): + dataset = self.datasets[lang] + + if isinstance(dataset, IterableDataset): + return dataset.__iter__() + else: + indices = np.arange(len(dataset)) + if self.shuffle: + np.random.shuffle(indices) + return iter(indices) + + # this method is the main function which builds and returns a composite, synthetic code-switched 
+ # utterance on the fly. It automatically works with all of the class-based variables stored to create + # the synthetic utterance + def build_single_CS_sample(self): + # get_sample_from_language returns a LongTensor for the transcripts so we create a LongTensor to hold + # all returned transcripts + comp_text = torch.LongTensor([]) + created_sample_duration_sec = 0 + created_sample_langs = [] + created_sample_audios = [] + + # if min_monolingual fires, it means we will just return a single, original monolingual utterance + # from one of our languages based on that language's probability + pure_mono = np.random.rand() <= self.min_monolingual + + # we continue to add to the composite utterance until we hit the min_duration + while created_sample_duration_sec < self.min_duration: + # we sample from only those languages which haven't already been sampled for this particular + # synthetic utterance, unless pure_random=True, in which case, you just sample with replacement + # every time + if (self.pure_random and not pure_mono) or ( + len(set(created_sample_langs)) == 0 or len(set(created_sample_langs)) == len(self.langs) + ): + lang_id = np.random.choice(self.langs, p=self.lang_probs) + # elif pure_mono: + # use this approach if you want synthetic utterances which are all monolingual + # lang_id = created_sample_langs[0] + else: + # this code is for when we need to sample from only those languages which haven't been sampled + # yet for this utterance + p = np.array(list(map(self.prob_dict.get, list(self.langs_set - set(created_sample_langs))))) + p = p / p.sum() + lang_id = np.random.choice(list(self.langs_set - set(created_sample_langs)), p=p) + + audio, audio_len, labels, labels_len, *_ = self.get_sample_from_language(lang_id) + + # in case you get an audio which is all silence we keep sampling + if audio.count_nonzero().item() == 0: + continue + + sample_duration = len(audio) / self.sample_rate + if (created_sample_duration_sec + sample_duration) > 
self.max_duration: + continue + + if comp_text.device != labels.device: + comp_text = comp_text.to(labels.device) + + if audio.ndim > 1 and self.force_monochannel: + audio = audio.mean(dim=-1) + + created_sample_duration_sec += sample_duration + created_sample_langs.append(lang_id) + # need to use numpy instead of torch here because we need numpy's trim_zeros function + created_sample_audios.append(audio.cpu().numpy()) + comp_text = torch.cat([comp_text, labels], dim=0) + + # we want a real, non-synth pure_mono sample so we break soon as we have one + if pure_mono: + break + + # check that all samples have the same number of channels + sample_channels = list(set([s.ndim for s in created_sample_audios])) + if len(sample_channels) > 1: + raise RuntimeError( + "Mixture of audios with different number of channels in CodeSwitchedDataset. All sources must be same number of channels." + ) + + multichannel = sample_channels[0] > 1 + + # we start with pause_start amount of silence (zero array) which needs the correct shape for multi/mono channel + if multichannel: + comp_audio = np.zeros( + shape=(int(self.pause_start * self.sample_rate / 1000.0), created_sample_audios[0].shape[-1]), + dtype=created_sample_audios[0].dtype, + ) + else: + comp_audio = np.zeros( + shape=(int(self.pause_start * self.sample_rate / 1000.0),), dtype=created_sample_audios[0].dtype + ) + + # iterate over all mono-lingual samples to build the final composite + for idx, wav in enumerate(created_sample_audios): + if not multichannel: + # this function only works if mono-channel + wav = np.trim_zeros(wav) + + # normalise to provided DB level + wav_norm = wav * (10.0 ** (self.db_norm / 20.0) / np.maximum(0.01, (wav ** 2).mean(axis=0) ** 0.5)) + + # this part appends the normed waveform to the existing waveform, and inserts pause_join amount of silence + # if necessary, otherwise just a straight append + if idx < len(created_sample_audios) - 1: + if multichannel: + wav_norm = np.append( + wav_norm, + 
np.zeros( + shape=( + int(self.pause_join * self.sample_rate / 1000.0), + created_sample_audios[0].shape[-1], + ), + dtype=comp_audio.dtype, + ), + axis=0, + ) + else: + wav_norm = np.append( + wav_norm, + np.zeros(shape=(int(self.pause_join * self.sample_rate / 1000.0),), dtype=comp_audio.dtype), + axis=0, + ) + + # this is the penultimate composite wavform, just need to add pause_end silence + comp_audio = np.append(comp_audio, wav_norm, axis=0) + + # here we add the pause_end amount of silence, in correct channel shape + if multichannel: + comp_audio = np.append( + comp_audio, + np.zeros( + shape=(int(self.pause_end * self.sample_rate / 1000.0), created_sample_audios[0].shape[-1]), + dtype=comp_audio.dtype, + ), + axis=0, + ) + else: + comp_audio = np.append( + comp_audio, + np.zeros(shape=(int(self.pause_end * self.sample_rate / 1000.0),), dtype=comp_audio.dtype), + axis=0, + ) + + # we only want augmentation to happen on the final, synthetic utterance, and not on any of the individual + # languages, which is why we set augmentor=None when building the individual language datasets in audio_to_text_dataset.get_code_switched_dataset + # here we now apply augmentation to the final, synthetic utterance only + # all of this logic here happens in-memory, nothing is written to disk + if self.augmentor is not None: + # import here to avoid circular import error + # import here because otherwise CI test-nlp-imports fails since soundfile is only in requirements_asr and not in requirements_common + import soundfile as sf + + from nemo.collections.asr.parts.preprocessing import AudioSegment + + mb = io.BytesIO() + sf.write(mb, comp_audio, self.sample_rate, format='WAV') + mb.seek(0) + comp_audio_as = AudioSegment.from_file(mb, target_sr=self.sample_rate) + self.augmentor.perturb(comp_audio_as) + comp_audio = comp_audio_as.samples + + return ( + torch.tensor(comp_audio, dtype=audio.dtype, device=audio.device), + torch.tensor(len(comp_audio), device=audio_len.device).long(), 
+ comp_text, + torch.tensor(len(comp_text), device=labels_len.device).long(), + ) + + # this is a helper method which prepares all of the iterator objects for all languages + # based on whether that language's underlying dataset is a map or an IterableDataset + def prep_underlying_datasets(self): + worker_info = pt_data.get_worker_info() + if worker_info is None: + max_elements = self.length + wid = 0 + wnum = 1 + else: + wid = worker_info.id + wnum = worker_info.num_workers + max_elements = len(range(wid, self.length, wnum)) + + for lang in self.langs: + if self.lang_kind[lang] == 'map': + start_idx = (len(self.datasets[lang]) // self.world_size) * self.global_rank + end_idx = start_idx + (len(self.datasets[lang]) // self.world_size) + if self.global_rank == self.world_size - 1: + end_idx = len(self.datasets[lang]) + indices = range(start_idx + wid, end_idx, wnum) + self.datasets[lang] = pt_data.Subset(self.datasets[lang], indices) + + self.lang_iterables[lang] = self.get_iterable_by_lang(lang) + + return max_elements + + # returns a sample (audio and transcript) from any underlying language stored by the class on instantiation + # the sample returned is a tensor for the audio and a tensor of ints for the transcript + # this method automatically handles StopIteration errors for the underyling language and rebuilds + # the iterator if necessary + def get_sample_from_language(self, lang): + while True: + try: + val = next(self.lang_iterables[lang]) + if self.lang_kind[lang] == 'map': + val = self.datasets[lang][val] + return val + except StopIteration: + self.lang_iterables[lang] = self.get_iterable_by_lang(lang) + + def __iter__(self): + # we create primed iterators for all languages and return the grand total of samples for each + # underlying language as a sum + max_elements = self.prep_underlying_datasets() + + if self.infinity_mode: + while True: + yield self.build_single_CS_sample() + else: + n = 0 + while n < max_elements: + yield 
self.build_single_CS_sample() + n += 1 + + def __len__(self): + return self.length diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py index 4616f95e1a4f..ed9e53ae6ffe 100644 --- a/nemo/collections/common/parts/preprocessing/collections.py +++ b/nemo/collections/common/parts/preprocessing/collections.py @@ -159,6 +159,9 @@ def __init__( if hasattr(parser, "is_aggregate") and parser.is_aggregate and isinstance(text, str): if lang is not None: text_tokens = parser(text, lang) + # for future use if want to add language bypass to audio_to_text classes + # elif hasattr(parser, "lang") and parser.lang is not None: + # text_tokens = parser(text, parser.lang) else: raise ValueError("lang required in manifest when using aggregate tokenizers") else: diff --git a/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py b/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py index 22c657b25613..a38c33de05af 100644 --- a/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py +++ b/scripts/asr_language_modeling/ngram_lm/create_lexicon_from_arpa.py @@ -74,4 +74,6 @@ if tokenizer is None: f.write("{w}\t{s}\n".format(w=word, s=" ".join(word))) else: - f.write("{w}\t{s}\n".format(w=word, s=" ".join(tokenizer.text_to_tokens(word)))) + w_ids = tokenizer.text_to_ids(word) + if tokenizer.unk_id not in w_ids: + f.write("{w}\t{s}\n".format(w=word, s=" ".join(tokenizer.text_to_tokens(word)))) From e9b0b11835653020d685e198f74a8e04aab8d5ab Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 28 Jun 2023 09:26:40 -0700 Subject: [PATCH 068/123] Removed optional optimize_for_inference (#6933) Signed-off-by: Boris Fomitchev --- nemo/core/classes/exportable.py | 2 +- scripts/export.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index 38b8e1c1e31b..3d2682f2304e 100644 --- 
a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -204,7 +204,7 @@ def _export( check_trace=check_trace, check_tolerance=check_tolerance, ) - jitted_model = torch.jit.optimize_for_inference(torch.jit.freeze(jitted_model)) + jitted_model = torch.jit.freeze(jitted_model) if verbose: logging.info(f"JIT code:\n{jitted_model.code}") jitted_model.save(output) diff --git a/scripts/export.py b/scripts/export.py index 80cbcf3dc666..fe3b79ebdf28 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -158,7 +158,7 @@ def nemo_export(argv): check_trace=check_trace, check_tolerance=args.check_tolerance, onnx_opset_version=args.onnx_opset, - verbose=args.verbose, + verbose=bool(args.verbose), ) except Exception as e: From 7e20750b2035a46dffde4fae389ac1f3967bf3fc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 28 Jun 2023 15:07:31 -0600 Subject: [PATCH 069/123] Apply garbage collection interval to validation steps (#6870) (#6872) * Apply garbage collection inverval to validation steps * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../language_modeling/megatron_base_model.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index ceddc1dca4d4..eb6671452992 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -163,6 +163,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): # The automatic garbage collector sould be disabled before training starts. 
if self.gc_interval > 0: gc.disable() + self.validation_global_step = 1 def _enable_nvidia_optimizations(self): "These optimizations are present in NVIDIA NGC PyTorch Containers" @@ -225,6 +226,16 @@ def on_train_start(self) -> None: super().on_train_start() self.init_global_step = self.trainer.global_step + def on_validation_start(self) -> None: + super().on_validation_start() + if self.gc_interval > 0: + gc.collect() + + def on_validation_end(self) -> None: + super().on_validation_end() + if self.gc_interval > 0: + gc.collect() + def _build_vocab(self): """ Manipulate vocabulary (e.g., pad vocabulary for increased performance)/ @@ -373,6 +384,14 @@ def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unus if self.gc_interval > 0 and (self.trainer.global_step % self.gc_interval == 0): gc.collect() + def on_validation_batch_end(self, outputs, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + super().on_validation_batch_end(outputs, batch, batch_idx, dataloader_idx) + + if self.gc_interval > 0: + if self.validation_global_step % self.gc_interval == 0: + gc.collect() + self.validation_global_step += 1 + def setup_optimization( self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, ): From 3b4f37af4626130fc4c9c5c09671a209d6e284c5 Mon Sep 17 00:00:00 2001 From: Sara Rabhi Date: Wed, 28 Jun 2023 17:19:01 -0400 Subject: [PATCH 070/123] Enable `rpe` methods in bert-like models (#6898) * enable rpe in bert model Signed-off-by: sararb * expose position_embedding_type to config Signed-off-by: sararb --------- Signed-off-by: sararb --- examples/nlp/language_modeling/conf/megatron_bert_config.yaml | 1 + .../nlp/models/language_modeling/megatron/bert_model.py | 2 ++ .../nlp/models/language_modeling/megatron_bert_model.py | 1 + 3 files changed, 4 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml 
b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index a7e3364d41b4..4e53ded4a453 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -50,6 +50,7 @@ model: # model architecture encoder_seq_length: 512 max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. num_layers: 12 hidden_size: 768 ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py index 132f900298a6..cbbef2d56a15 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py @@ -188,6 +188,7 @@ def __init__( add_binary_head=True, megatron_legacy=False, sequence_parallel=False, + position_embedding_type='learned_absolute', ): super(BertModel, self).__init__() # args = get_args() @@ -234,6 +235,7 @@ def __init__( onnx_safe=onnx_safe, megatron_legacy=megatron_legacy, sequence_parallel=sequence_parallel, + position_embedding_type=position_embedding_type, ) self.initialize_word_embeddings( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index cac1a50e98ae..ab0459b2966c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -182,6 +182,7 @@ def model_provider_func(self, pre_process, post_process): add_binary_head=cfg.bert_binary_head, megatron_legacy=cfg.get('megatron_legacy', False), sequence_parallel=self.cfg.get('sequence_parallel', False), + 
position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"), ) return model From 69747d8ce90595d4d6826c01c7c6dc034f5c8265 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Thu, 29 Jun 2023 01:41:52 +0400 Subject: [PATCH 071/123] Fix AN4 dataset links (#6926) * Fix an4 dataset link in docs Signed-off-by: Vladimir Bataev * Remove broken a4 dataset links from tutorials Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev --- docs/source/asr/datasets.rst | 2 +- tutorials/asr/ASR_for_telephony_speech.ipynb | 2 +- tutorials/asr/ASR_with_NeMo.ipynb | 2 +- tutorials/asr/ASR_with_Subword_Tokenization.ipynb | 2 +- tutorials/asr/ASR_with_Transducers.ipynb | 2 +- tutorials/asr/Online_Noise_Augmentation.ipynb | 2 +- tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb | 2 +- .../speaker_tasks/Speaker_Identification_Verification.ipynb | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 617d5195005f..05278ecb2437 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -126,7 +126,7 @@ AN4 Dataset This is a small dataset recorded and distributed by Carnegie Mellon University. It consists of recordings of people spelling out addresses, names, etc. Information about this dataset can be found on the `official CMU site `_. -#. `Download and extract the dataset `_ (which is labeled "NIST's Sphere audio (.sph) format (64M)". +#. `Download and extract the dataset `_ (which is labeled "NIST's Sphere audio (.sph) format (64M)". #. Convert the ``.sph`` files to ``.wav`` using sox, and build one training and one test manifest. diff --git a/tutorials/asr/ASR_for_telephony_speech.ipynb b/tutorials/asr/ASR_for_telephony_speech.ipynb index 11ba4b85bd47..6133fdc9a8b9 100644 --- a/tutorials/asr/ASR_for_telephony_speech.ipynb +++ b/tutorials/asr/ASR_for_telephony_speech.ipynb @@ -103,7 +103,7 @@ "# Download the dataset. 
This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index 0c0d239bf58c..74cd0f739e84 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -189,7 +189,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb index b932916f2bc5..cdb36251fb70 100644 --- a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb +++ b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb @@ -372,7 +372,7 @@ "# Download the dataset. 
This will take a few moments...\r\n", "print(\"******\")\r\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\r\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \r\n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\r\n", " an4_path = wget.download(an4_url, data_dir)\r\n", " print(f\"Dataset downloaded at: {an4_path}\")\r\n", "else:\r\n", diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index e6bccc3f0f42..e1eb494f777e 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -137,7 +137,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/Online_Noise_Augmentation.ipynb b/tutorials/asr/Online_Noise_Augmentation.ipynb index f8741cdcbfe1..8883cce55a80 100644 --- a/tutorials/asr/Online_Noise_Augmentation.ipynb +++ b/tutorials/asr/Online_Noise_Augmentation.ipynb @@ -135,7 +135,7 @@ "# Download the dataset. 
This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 62481c3762d2..c9c547a8383e 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -190,7 +190,7 @@ "# Download the dataset. This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index 8e3ae9c1f131..dce8c46df1b0 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -85,7 +85,7 @@ "# Download the dataset. 
This will take a few moments...\n", "print(\"******\")\n", "if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):\n", - " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz' # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz \n", + " an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'\n", " an4_path = wget.download(an4_url, data_dir)\n", " print(f\"Dataset downloaded at: {an4_path}\")\n", "else:\n", From 295e88fcb171ab6ce7264f9ed6f022ed13db8d8f Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Wed, 28 Jun 2023 15:13:28 -0700 Subject: [PATCH 072/123] Update core commit for CI (#6939) * Update core commit for CI Signed-off-by: Abhinav Khattar * add comment Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar --- Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8a151d34c336..caf48f7e1624 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -59,9 +59,10 @@ pipeline { stage('Megatron Core installation') { steps { + // commit points to core 23.05 ToT sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout d2891b4ad3a00e3c4223f89491afd9e1b812f9b5 && \ + git checkout 060415572f4365a2e895f8036c4e37dad0efbdf5 && \ pip install -e .' 
} } From 29b9b8a719300bf675f94d00092c12bad2d4c433 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 28 Jun 2023 16:14:54 -0600 Subject: [PATCH 073/123] Tensor-parallel communication overlap with userbuffer backend (#6792) * Tensor-parallel communication overlap with userbuffer backend (#6780) * add interfaces for tp_communication overlap [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Interface to provide custom userbuffer communicator settings by yaml file [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Construct MPI process group for userbuffers support Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: Tim Moon Co-authored-by: Abhinav Khattar * codeql change Signed-off-by: Abhinav Khattar --------- Signed-off-by: Tim Moon Signed-off-by: ericharper Signed-off-by: Abhinav Khattar Co-authored-by: Sangkug Lym Co-authored-by: Tim Moon Co-authored-by: Abhinav Khattar Co-authored-by: ericharper --- .../conf/megatron_gpt_config.yaml | 7 +++++ .../language_modeling/megatron/gpt_model.py | 2 ++ .../language_modeling/megatron_base_model.py | 9 ++++++ .../language_modeling/megatron_gpt_model.py | 29 +++++++++++++++++++ .../modules/common/megatron/language_model.py | 4 +++ .../modules/common/megatron/megatron_init.py | 2 ++ .../modules/common/megatron/transformer.py | 4 +++ nemo/collections/nlp/parts/nlp_overrides.py | 4 +++ nemo/utils/app_state.py | 17 +++++++++++ 9 files changed, 78 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 8d7fd09e4307..e588e94a6720 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -166,6 +166,13 @@ model: fp8_amax_compute_algo: most_recent # 
'most_recent' or 'max'. Algorithm for computing amax from history reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. + # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. ## Flash Attention use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index b32bfdb09f20..8e28b6cab362 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -164,6 +164,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, use_flash_attention=False, ): super(GPTModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -246,6 +247,7 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, use_flash_attention=use_flash_attention, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py 
index eb6671452992..a54c68866d26 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -131,6 +131,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): global_batch_size=cfg.get('global_batch_size'), rampup_batch_size=cfg.get('rampup_batch_size'), use_fp8=cfg.get('fp8', False), + init_mpi_proc_group=cfg.get('ub_tp_comm_overlap', False), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), ) @@ -578,6 +579,14 @@ def _validate_and_override_config(self): 'Make sure the number of model chunks is the same across all pipeline stages.' ) + if self.cfg.get('ub_tp_comm_overlap', False): + if not self.cfg.get('transformer_engine', False) or not self.cfg.get('sequence_parallel', False): + logging.info( + "Userbuffer tensor-parallel communication overlap is available with both Transformer Engine and sequence-parallelism." 
+ ) + with open_dict(self.cfg): + self.cfg.ub_tp_comm_overlap = False + def is_data_parallel_rank_zero(self): if is_global_rank_zero(): return True diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 84caed6c111e..817ef0bd6442 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -85,6 +85,7 @@ try: import transformer_engine + from transformer_engine.pytorch import module as te_module HAVE_TE = True @@ -282,6 +283,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._nsys_profile_end_step *= grad_accum_steps self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True) + self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) def get_gpt_module_list(self): if isinstance(self.model, list): @@ -357,6 +359,7 @@ def model_provider_func(self, pre_process, post_process): fp8_amax_compute_algo=self.cfg.get('fp8_amax_compute_algo', 'most_recent'), reduce_amax=self.cfg.get('reduce_amax', True), use_emha=self.cfg.get('use_emha', False), + ub_tp_comm_overlap=self.cfg.get('ub_tp_comm_overlap', False), use_flash_attention=self.cfg.get('use_flash_attention', False), megatron_legacy=self.cfg.get('megatron_legacy', False), ) @@ -515,6 +518,32 @@ def training_step(self, dataloader_iter, batch_idx): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ + # Initialize userbuffer communicators. Initialization is done only once at the + # beginning of the first training step. 
+ if self.initialize_ub: + input_shape = [ + self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('hidden_size'), + ] + ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) + ub_cfgs = None + if ub_cfg_file_name is not None: + try: + import yaml + + with open(ub_cfg_file_name, 'r') as ub_cfg_file: + ub_cfgs = yaml.safe_load(ub_cfg_file) + except (ImportError, TypeError): + print("Fail to read ub_tp_comm_overlap config file.") + + te_module.initialize_ub( + shape=input_shape, + tp_size=self.cfg.get('tensor_model_parallel_size'), + use_fp8=self.cfg.get('fp8'), + ub_cfgs=ub_cfgs, + ) + self.initialize_ub = False + if self.rampup_batch_size: num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR current_global_batch_size = num_microbatch_calculator.current_global_batch_size diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index e6305e563549..683163246379 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -121,6 +121,7 @@ def get_language_model( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, use_flash_attention=False, ): """Build language model and return along with the key to save.""" @@ -197,6 +198,7 @@ def get_language_model( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, use_flash_attention=use_flash_attention, ) # key used for checkpoints. 
@@ -504,6 +506,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, use_flash_attention=False, ): super(TransformerLanguageModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -643,6 +646,7 @@ def __init__( fp8_amax_compute_algo=fp8_amax_compute_algo, reduce_amax=reduce_amax, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, position_embedding_type=position_embedding_type, use_flash_attention=use_flash_attention, ) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index e0551fad5d16..7431bffad26c 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -67,6 +67,7 @@ def initialize_model_parallel_for_nemo( global_batch_size=None, rampup_batch_size=None, use_fp8=False, + init_mpi_proc_group=False, seed=1234, apex_transformer_log_level=30, ): @@ -83,6 +84,7 @@ def initialize_model_parallel_for_nemo( app_state.pipeline_model_parallel_size = pipeline_model_parallel_size app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size app_state.use_fp8 = use_fp8 + app_state.init_mpi_proc_group = init_mpi_proc_group ( app_state.tensor_model_parallel_rank, app_state.pipeline_model_parallel_rank, diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 258e42ce9694..652a3e6f4e3a 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -814,6 +814,7 @@ def __init__( layer_type: str = "encoder", drop_path_rate: float = 0, use_emha: bool = False, + ub_tp_comm_overlap: bool = False, autocast_dtype: Any = 16, zero_centered_gamma: bool = False, ) -> None: @@ -846,6 +847,7 @@ def __init__( 
set_parallel_mode=tp_size > 1, fuse_qkv_params=True, zero_centered_gamma=zero_centered_gamma, + ub_tp_comm_overlap=ub_tp_comm_overlap, ) # use_emha=use_emha, @@ -941,6 +943,7 @@ def __init__( fp8_amax_compute_algo='most_recent', reduce_amax=True, use_emha=False, + ub_tp_comm_overlap=False, normalize_attention_scores=True, multi_query_attention=False, num_moe_experts=1, @@ -1084,6 +1087,7 @@ def build_layer(layer_number): apply_residual_connection_post_layernorm=False, autocast_dtype=precision, use_emha=use_emha, + ub_tp_comm_overlap=ub_tp_comm_overlap, zero_centered_gamma=normalization == 'layernorm1p', ) else: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index c390ba995843..3b5a14c52f37 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -181,6 +181,10 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: app_state.data_parallel_size = parallel_state.get_data_parallel_world_size() app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group() + # create MPI process group for UCX-based communication APIs + if app_state.init_mpi_proc_group: + torch.distributed.new_group(backend='mpi') + def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index c3ead0bff48f..d06e1ac32e36 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -55,6 +55,7 @@ def __init__(self): self._data_parallel_group = None self._megatron_checkpoint_version = None self._use_fp8 = False + self._init_mpi_proc_gruop = False self._random_seed = None @@ -363,6 +364,22 @@ def use_fp8(self, use_fp8): """ self._use_fp8 = use_fp8 + @property + def init_mpi_proc_group(self): + """ Property sets the initialization of mpi process group. + Returns: + Initialize mpi process group. 
+ """ + return self._init_mpi_proc_group + + @init_mpi_proc_group.setter + def init_mpi_proc_group(self, init_mpi_proc_group): + """ Property sets the initialization of mpi process group. + Args: + init_mpi_proc_group: Initialize mpi process group. + """ + self._init_mpi_proc_group = init_mpi_proc_group + @property def random_seed(self): """ Property returns the random seed. From 5260c9c2f7202770fcc8f1a3fbf30fb07658918c Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 28 Jun 2023 16:25:46 -0700 Subject: [PATCH 074/123] lora inference ci (#6931) * inference test Signed-off-by: arendu * Update typo Signed-off-by: Adi Renduchintala --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala Co-authored-by: Abhinav Khattar --- Jenkinsfile | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index caf48f7e1624..1a79d87bcd38 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3778,7 +3778,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } failFast true steps { - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results" + sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ @@ -3787,7 +3787,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.val_check_interval=3 \ ++trainer.limit_val_batches=2 \ trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ @@ -3801,7 +3801,21 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.validation_ds.num_workers=0 \ 
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ model.data.validation_ds.names=[quarel]" - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results" + sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_tuning/checkpoints/megatron_gpt_peft_tuning.nemo \ + model.peft.restore_from_ckpt_name=null \ + model.peft.restore_from_hparams_path=null \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'" + sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" } } stage('L2: Megatron GPT Eval') { From a27ba52a673789c88fc3382d4ea53f9d2d9bd131 Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Wed, 28 Jun 2023 22:30:46 -0400 Subject: [PATCH 075/123] support value attribution condition (#6934) * text gen condition on value Signed-off-by: Yi Dong * fix round function Signed-off-by: Yi Dong * predict value Signed-off-by: Yi Dong * scale 9 Signed-off-by: Yi Dong * handle hard code label Signed-off-by: Yi Dong * use likert scale 7 Signed-off-by: Yi Dong * scale 6 Signed-off-by: Yi Dong * merge the latest main Signed-off-by: Yi Dong * added latest chatbot ui Signed-off-by: Yi Dong * added new playground interface Signed-off-by: Yi Dong * default scale 9 Signed-off-by: Yi Dong * address comments Signed-off-by: Yi Dong * add speicial tokens Signed-off-by: Yi Dong * handles more tokenizer Signed-off-by: Yi Dong * [pre-commit.ci] auto fixes from pre-commit.com hooks for 
more information, see https://pre-commit.ci * added comments Signed-off-by: Yi Dong * fix type Signed-off-by: Yi Dong * faster check Signed-off-by: Yi Dong --------- Signed-off-by: Yi Dong Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_inference.yaml | 7 +- .../language_modeling/megatron_gpt_eval.py | 8 +- .../megatron/gpt_sft_chat_dataset.py | 150 +++++-- .../language_modeling/megatron_base_model.py | 5 + .../nlp/modules/common/megatron_web_server.py | 380 +++++++++++++----- 5 files changed, 409 insertions(+), 141 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml index 6bd1be905a97..53d4e9b7e82b 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml @@ -35,4 +35,9 @@ share: False # whether create a public URL username: test # user name for web client password: test2 # password for web client web_port: 9889 # the port number of the web server -chat: False # use the chat interface \ No newline at end of file +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index af1657b44d7b..dc299fccdf90 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -15,6 +15,7 @@ import asyncio import os import threading +from functools import partial import torch from omegaconf import OmegaConf, open_dict @@ -301,7 +302,12 @@ def main(cfg) -> None: if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: if cfg.web_server: if cfg.chat: - web_ui = get_chatbot_demo + defaults = { + 'user': cfg.chatbot_config.user, + 'assistant': cfg.chatbot_config.assistant, + 'system': cfg.chatbot_config.system, + } + web_ui = partial(get_chatbot_demo, defaults=defaults, value=cfg.chatbot_config.value) else: web_ui = get_demo loop = asyncio.new_event_loop() diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 2c896c2e61af..d6c2257ebabb 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -16,6 +16,7 @@ import torch +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset from nemo.utils import logging @@ -29,25 +30,65 @@ SYSTEM_TOKEN = "System\n" TURN_TOKEN = "" -GUARD_RAIL_INSTRUCTION = { - "TEXT_TO_CANONICAL_FORM": "Given a dialogue, for each turn you need to generate a short summary called a canonical form. 
Generate the canonical form for the last turn in the dialogue.", - "CANONICAL_FORM_TO_TEXT": "Given a dialogue, for each turn we also have a short summary called a canonical form. Generate the canonical form given the last turn message and canonical form. Then generate the message.", +TYPE_INSTRUCTION = { + 'TEXT_TO_VALUE': "", + 'VALUE_TO_TEXT': '', } -def _mask_targets(target, tokenized_lens, speakers, header_len, s_ids, tokenizer, mask_role): +def _mask_targets( + target, + tokenized_lens, + speakers, + header_len, + s_ids, + tokenizer, + mask_role, + gtype, + extra_id_2_token_id, + new_line_token_id, +): + """ This function masks the tokens so the loss is computed only on the non-masked role's responses. + For 'TEXT_TO_VALUE' type, the loss is computed on the value attributes. + + Args: + target (Tensor): input ids + tokenized_lens (List[int]): array of lengths of each turns + speakers (List[str]): array of speakers of each turns + header_len (int): the system prompt length + s_ids (List[Tensor]): array of tokenized ids of each turns + tokenizer (TokenizerSpec): tokenizer object + mask_role (str): the speaker id to be masked from loss computation + gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT' + extra_id_2_token_id (int): token id + new_line_token_id (int): new line token id + + """ cur_idx = header_len tgt_len = target.shape[0] for i, (tokenized_len, speaker, s_id) in enumerate(zip(tokenized_lens, speakers, s_ids)): - # note, sentence piece will add extra empty token in front. s_id has that extra token too - skip_name_len = len(tokenizer.text_to_ids(TURN_TOKEN + speaker + END_NAME_SIGNAL)) + # note, sentence piece will add extra empty token in front. 
has to compute the diff + id1 = tokenizer.text_to_ids("") + id2 = tokenizer.text_to_ids("" + TURN_TOKEN + speaker + END_NAME_SIGNAL) + skip_name_len = len(id2) - len(id1) + if extra_id_2_token_id is None: + raise ValueError("extra_id_2 is not in the vocabulary") + if (s_id == extra_id_2_token_id).any().item(): + if gtype == 'VALUE_TO_TEXT': + # if contains the token + assert skip_name_len == torch.where((s_id == extra_id_2_token_id))[0].item() + # find new line token id 14 + more_skip_len = torch.where((s_id[skip_name_len:] == new_line_token_id))[0][0].item() + 1 + skip_name_len += more_skip_len + elif gtype == 'TEXT_TO_VALUE': + skip_name_len = torch.where((s_id == extra_id_2_token_id))[0].item() + 1 if cur_idx >= tgt_len: break elif cur_idx + tokenized_len < tgt_len: # Check whether the mask is applied to the correct position, the first token is turn token: # s_id[2:] skips the artifact empty token and the turn token # target[cur_idx + 1:cur_idx + tokenized_len] skip the turn token - if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[2:]): + if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[1:]): logging.warning("a sentence mismatches the corresponding piece " "in the conversation") if i == 0: # mask the first turn completely to provide at least one turn as context @@ -57,7 +98,7 @@ def _mask_targets(target, tokenized_lens, speakers, header_len, s_ids, tokenizer target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX else: # mask up to the name end, need to remove one as skip name has an extra artifact empty token - target[cur_idx : cur_idx + skip_name_len - 1] = IGNORE_INDEX + target[cur_idx : cur_idx + skip_name_len] = IGNORE_INDEX cur_idx += tokenized_len @@ -65,6 +106,13 @@ def cannonical_form_formater(cannoical_form): return f'{cannoical_form}\n' +def response_value_formater(label): + if isinstance(label, str): + return '' + label + '\n' + else: + raise ValueError(f'Unknown label type {type(label)}, only str type 
is supported') + + def _add_speaker_and_signal(header, source, mask_role, gtype): """Add speaker and start/end signal on each round.""" BEGIN_SIGNAL = "" @@ -76,28 +124,30 @@ def _add_speaker_and_signal(header, source, mask_role, gtype): sentence["value"] = ( BEGIN_SIGNAL + role_token + sentence_from + END_NAME_SIGNAL + sentence["value"] + END_SIGNAL ) - elif gtype == "TEXT_TO_CANONICAL_FORM": + elif gtype == "VALUE_TO_TEXT": sentence["value"] = ( BEGIN_SIGNAL + role_token + sentence_from + END_NAME_SIGNAL + + (response_value_formater(sentence['label']) if 'label' in sentence else '') + sentence["value"] + END_SIGNAL - + cannonical_form_formater(sentence['canonical_form']) ) - elif gtype == "CANONICAL_FORM_TO_TEXT": + elif gtype == "TEXT_TO_VALUE": sentence["value"] = ( BEGIN_SIGNAL + role_token + sentence_from + END_NAME_SIGNAL - + cannonical_form_formater(sentence['canonical_form']) + sentence["value"] + END_SIGNAL + + (response_value_formater(sentence['label']) if 'label' in sentence else '') ) else: - raise ValueError(f"source type {gtype} not supported") + raise ValueError( + f"source type {gtype} not supported, only 'VALUE_TO_TEXT' and 'TEXT_TO_VALUE' are supported" + ) conversation += sentence["value"] # if the last turn is not masked, add next token start token to the end, which will be included for loss calculation if sentence_from != mask_role and i == len(source) - 1: @@ -105,9 +155,7 @@ def _add_speaker_and_signal(header, source, mask_role, gtype): return conversation -def preprocess( - source: dict, tokenizer: TokenizerSpec, -): +def preprocess(source: dict, tokenizer: TokenizerSpec, extra_id_2_token_id: int, new_line_token_id: int): """ Given a conversation list. This transform: 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; @@ -115,17 +163,18 @@ def preprocess( 3. Tokenize the concatenated conversation; 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. 
""" - canonical_type = None + data_type = None if 'type' in source: - canonical_type = source['type'] - assert canonical_type in GUARD_RAIL_INSTRUCTION, f"source type {canonical_type} not supported" + data_type = source['type'] + assert data_type in TYPE_INSTRUCTION, f"source type {data_type} not supported" # add end signal and concatenate together conversation = source['system'] - if canonical_type is not None: - conversation = conversation + '\n' + GUARD_RAIL_INSTRUCTION[canonical_type] + if data_type is not None: + if TYPE_INSTRUCTION[data_type] != '': + conversation = conversation + '\n' + TYPE_INSTRUCTION[data_type] mask_role = source.get('mask', 'User') - header = f"{SYSTEM_TOKEN}{conversation}\n\n" - conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, canonical_type) + header = f"{SYSTEM_TOKEN}{conversation}" + conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, data_type) # tokenize conversations input_ids = tokenizer.text_to_ids(conversation) target = copy.deepcopy(input_ids) @@ -134,10 +183,16 @@ def preprocess( ids = [] tokenized_lens = [] for s in source['conversations']: - tokenized_sentence = tokenizer.text_to_ids(s["value"]) - ids.append(torch.tensor(tokenized_sentence)) - # remove one token as it adds an empty token in front - tokenized_lens.append(len(tokenized_sentence) - 1) + if isinstance(tokenizer, SentencePieceTokenizer): + tokenized_sentence = tokenizer.text_to_ids(s["value"]) + ids.append(torch.tensor(tokenized_sentence)[1:]) + # remove one token as it adds an empty token in front + tokenized_lens.append(len(tokenized_sentence) - 1) + else: + tokenized_sentence = tokenizer.text_to_ids(s["value"]) + ids.append(torch.tensor(tokenized_sentence)) + # remove one token as it adds an empty token in front + tokenized_lens.append(len(tokenized_sentence)) speakers = [sentence["from"] for sentence in source['conversations']] assert mask_role in speakers, "mask role not in the conversation" 
target = torch.LongTensor(target) @@ -145,18 +200,51 @@ def preprocess( target[:header_len] = IGNORE_INDEX input_ids = torch.LongTensor(input_ids) - _mask_targets(target, tokenized_lens, speakers, header_len, ids, tokenizer, mask_role) + _mask_targets( + target, + tokenized_lens, + speakers, + header_len, + ids, + tokenizer, + mask_role, + data_type, + extra_id_2_token_id, + new_line_token_id, + ) mask = (target != IGNORE_INDEX).bool() assert mask.sum().item() != 0, "mask is empty" return dict(input_ids=input_ids, mask=mask) +def _check_token_in_vocab(tokenizer, token): + ids = tokenizer.text_to_ids(token) + if isinstance(tokenizer, SentencePieceTokenizer): + return len(ids) == 2 + else: + return len(ids) == 1 + + class GPTSFTChatDataset(GPTSFTDataset): def _build_samples_mapping(self): super()._build_samples_mapping() assert hasattr(self.tokenizer, "vocab"), "tokenizer should have vocab property, not supported" - assert '' in self.tokenizer.vocab, " not in the tokenizer vocab. not supported" - assert '' in self.tokenizer.vocab, " not in the tokenizer vocab. not supported" + assert _check_token_in_vocab( + self.tokenizer, '' + ), " not in the tokenizer vocab. not supported" + assert _check_token_in_vocab( + self.tokenizer, '' + ), " not in the tokenizer vocab. not supported" + # calcuilate id value + if _check_token_in_vocab(self.tokenizer, ''): + ids_1 = self.tokenizer.text_to_ids('') + ids_2 = self.tokenizer.text_to_ids('') + self.extra_id_2_token_id = ids_1[len(ids_2) :][0] + else: + self.extra_id_2_token_id = None + ids_1 = self.tokenizer.text_to_ids('\n') + ids_2 = self.tokenizer.text_to_ids('') + self.new_line_token_id = ids_1[len(ids_2) :][0] def _process_example(self, example): """ @@ -164,7 +252,7 @@ def _process_example(self, example): Truncation is carried out when needed, but it is performed only on the prompt side. BOS, EOS, and SEP, are added if specified. 
""" - result = preprocess(example, self.tokenizer) + result = preprocess(example, self.tokenizer, self.extra_id_2_token_id, self.new_line_token_id) return result diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index a54c68866d26..e018a4decaf6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -17,6 +17,7 @@ import re from typing import Any, Dict, Optional, Union +import omegaconf import torch from omegaconf import open_dict from omegaconf.dictconfig import DictConfig @@ -223,6 +224,10 @@ def _build_tokenizer(self): legacy=legacy, ) + if self._cfg.tokenizer.get('additional_special_tokens', None) is not None: + tokens_list = omegaconf.OmegaConf.to_object(self._cfg.tokenizer.additional_special_tokens) + self.tokenizer.add_special_tokens({'additional_special_tokens': tokens_list}) + def on_train_start(self) -> None: super().on_train_start() self.init_global_step = self.trainer.global_step diff --git a/nemo/collections/nlp/modules/common/megatron_web_server.py b/nemo/collections/nlp/modules/common/megatron_web_server.py index d3ccde49a5c5..648bca024ba0 100644 --- a/nemo/collections/nlp/modules/common/megatron_web_server.py +++ b/nemo/collections/nlp/modules/common/megatron_web_server.py @@ -32,8 +32,33 @@ TURN_TOKEN = '' -DEFAULT_SYSTEM = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" -SYSTEM_TOKEN = 'System\n' +PROMPT_PRESETS = { + "DIALOGUE": { + "SYSTEM_TURN_TOKEN": '', + "USER_TURN_TOKEN": '', + "BOT_TURN_TOKEN": '', + "END_OF_NAME": '', + "END_OF_TURN": '\n', + }, + "DIALOGUE2": { + "SYSTEM_TURN_TOKEN": 'System\n', + "USER_TURN_TOKEN": '', + "BOT_TURN_TOKEN": '', + "END_OF_NAME": '\n', + "END_OF_TURN": '\n', + }, +} + + +PRESETS = { + "K1-Greedy": {"temperature": 1.0, "top_p": 0.9, "top_k": 1, "repetition_penalty": 1.0,}, + "K50": {"temperature": 0.75, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, + "K50-Creative": {"temperature": 0.85, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, + "K50-Precise": {"temperature": 0.1, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, + "K50-Original": {"temperature": 0.9, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, + "Nucleus9": {"temperature": 0.8, "top_p": 0.9, "top_k": 10000, "repetition_penalty": 1.0,}, + "Custom": {"temperature": 0.75, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.0,}, +} def check_gradio_import(): @@ -48,55 +73,25 @@ def check_gradio_import(): def create_gen_function(port=5555, chat=False): - if chat: - - def get_generation( - prompt, preamble, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings - ): - if preamble is not None and preamble != '': - prompt = SYSTEM_TOKEN + preamble + prompt - data = { - "sentences": [prompt], - "tokens_to_generate": int(token_to_gen), - "temperature": temp, - "add_BOS": add_BOS, - "top_k": top_k, - "top_p": top_p, - "greedy": greedy, - "all_probs": False, - "repetition_penalty": repetition, - "min_tokens_to_generate": int(min_tokens), - "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], - } - response = text_generation(data, port=port) - sentences = response['sentences'] - bot_message = sentences[0] - bot_message = bot_message[len(prompt) :] - return 
bot_message - - else: - - def get_generation( - prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings - ): - data = { - "sentences": [prompt], - "tokens_to_generate": int(token_to_gen), - "temperature": temp, - "add_BOS": add_BOS, - "top_k": top_k, - "top_p": top_p, - "greedy": greedy, - "all_probs": False, - "repetition_penalty": repetition, - "min_tokens_to_generate": int(min_tokens), - "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], - } - response = text_generation(data, port=port) - sentences = response['sentences'] - bot_message = sentences[0] - bot_message = bot_message[len(prompt) :] - return bot_message + def get_generation(prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings): + data = { + "sentences": [prompt], + "tokens_to_generate": int(token_to_gen), + "temperature": temp, + "add_BOS": add_BOS, + "top_k": top_k, + "top_p": top_p, + "greedy": greedy, + "all_probs": False, + "repetition_penalty": repetition, + "min_tokens_to_generate": int(min_tokens), + "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], + } + response = text_generation(data, port=port) + sentences = response['sentences'] + bot_message = sentences[0] + bot_message = bot_message[len(prompt) :] + return bot_message return get_generation @@ -104,33 +99,54 @@ def get_generation( def get_demo(share, username, password, server_port=5555, web_port=9889, loop=None): check_gradio_import() asyncio.set_event_loop(loop) - with gr.Blocks() as demo: + with gr.Blocks(css=CSS) as demo: with gr.Row(): with gr.Column(scale=2, width=200): - greedy_flag = gr.Checkbox(label="Greedy") - add_BOS = gr.Checkbox(label="Add BOS token", value=False) + # store the mutliple turn conversation token_to_gen = gr.Number(label='Number of Tokens to generate', value=300, type=int) min_token_to_gen = gr.Number(label='Min number of Tokens to generate', value=1, type=int) - temperature = 
gr.Slider(minimum=0.0, maximum=10.0, value=1.0, label='Temperature', step=0.1) - top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.9, label='Top P') - top_k = gr.Slider(minimum=0, maximum=10000, step=2, value=0, label='Top K') + seed = gr.Number(label='Random seed', value=0, type=int) + end_strings = gr.Textbox(label="End strings (comma separated)", value=",", lines=1,) + add_BOS = gr.Checkbox(label="Add BOS token", value=False) + sampling_method = gr.Dropdown( + list(PRESETS.keys()), label='Sampling Presets', default='K50', value='K50' + ) + temperature = gr.Slider(minimum=0.0, maximum=5.0, value=0.75, label='Temperature', step=0.1) + top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.95, label='Top P') + top_k = gr.Slider(minimum=0, maximum=1024, step=2, value=50, label='Top K') + repetition_penality = gr.Slider( - minimum=1.0, maximum=5.0, step=0.02, value=1.2, label='Repetition penalty' + minimum=1.0, maximum=5.0, step=0.02, value=1.0, label='Repetition penalty' ) - end_strings = gr.Textbox(label="End strings (comma separated)", value="<|endoftext|>,", lines=1,) - with gr.Column(scale=1, min_width=800): - input_prompt = gr.Textbox( - label="Input", - value="Ariel was playing basketball. 1 of her shots went in the hoop. 2 of her shots did not go in the hoop. 
How many shots were there in total?", - lines=5, + + def set_sampling(x): + return list(PRESETS[x].values()) + + sampling_method.change( + set_sampling, inputs=[sampling_method], outputs=[temperature, top_p, top_k, repetition_penality] ) - output_box = gr.Textbox(value="", label="Output") - btn = gr.Button(value="Submit") - btn.click( - create_gen_function(server_port, chat=False), - inputs=[ - input_prompt, - greedy_flag, + + with gr.Column(scale=1, min_width=900): + text = gr.Textbox(label="Playground", value="", lines=60, placeholder="Type something here...",) + submit_btn = gr.Button("Generate") + clear = gr.Button("Clear") + + def on_submit( + prompt_text, + token_to_gen, + temperature, + top_p, + top_k, + repetition_penality, + seed, + end_strings, + add_BOS, + min_token_to_gen, + ): + + output = create_gen_function(server_port)( + prompt_text, + False, add_BOS, token_to_gen, min_token_to_gen, @@ -139,41 +155,142 @@ def get_demo(share, username, password, server_port=5555, web_port=9889, loop=No top_k, repetition_penality, end_strings, + ) + print(output) + print('-------------------') + return prompt_text + output + + def clear_fun(): + return '' + + submit_btn.click( + on_submit, + [ + text, + token_to_gen, + temperature, + top_p, + top_k, + repetition_penality, + seed, + end_strings, + add_BOS, + min_token_to_gen, ], - outputs=[output_box], + [text], + queue=False, ) - demo.launch(share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password)) + clear.click(clear_fun, None, text, queue=False) + demo.queue(concurrency_count=16).launch( + share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password) + ) -def get_chatbot_demo(share, username, password, server_port=5555, web_port=9889, loop=None): +def get_chatbot_demo( + share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None +): check_gradio_import() from nemo.collections.nlp.modules.common.chatbot_component import Chatbot 
asyncio.set_event_loop(loop) with gr.Blocks(css=CSS) as demo: - # store the mutliple turn conversation with gr.Row(): with gr.Column(scale=2, width=200): # store the mutliple turn conversation session_state = gr.State(value=[]) - greedy_flag = gr.Checkbox(label="Greedy", value=True) - add_BOS = gr.Checkbox(label="Add BOS token", value=False) token_to_gen = gr.Number(label='Number of Tokens to generate', value=300, type=int) - min_token_to_gen = gr.Number(label='Min number of Tokens to generate', value=1, type=int) - temperature = gr.Slider(minimum=0.0, maximum=10.0, value=1.0, label='Temperature', step=0.1) - top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.9, label='Top P') - top_k = gr.Slider(minimum=0, maximum=10000, step=2, value=0, label='Top K') - repetition_penality = gr.Slider( - minimum=1.0, maximum=5.0, step=0.02, value=1.2, label='Repetition penalty' + seed = gr.Number(label='Random seed', value=0, type=int) + prompt_presets = gr.Dropdown( + list(PROMPT_PRESETS.keys()), label='Template Presets', default='DIALOGUE2', value='DIALOGUE2' + ) + sampling_method = gr.Dropdown( + list(PRESETS.keys()), label='Sampling Presets', default='K50', value='K50' ) - end_strings = gr.Textbox( - label="End strings (comma separated)", value=f"<|endoftext|>,,", lines=1, + with gr.Accordion("Sampling Parameters", open=False): + temperature = gr.Slider( + minimum=0.0, maximum=5.0, value=0.75, label='Temperature', step=0.1, interactive=False + ) + top_p = gr.Slider( + minimum=0.0, maximum=1.0, step=0.02, value=0.95, label='Top P', interactive=False + ) + top_k = gr.Slider(minimum=0, maximum=1024, step=2, value=50, label='Top K', interactive=False) + repetition_penality = gr.Slider( + minimum=1.0, maximum=5.0, step=0.02, value=1.0, label='Repetition penalty', interactive=False + ) + + with gr.Accordion("Value Parameters", open=True, visible=value): + keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate'] + quality_value 
= gr.Slider( + minimum=0, maximum=9, step=1, value=9, label='Quality', interactive=True, visible=True + ) + toxicity_value = gr.Slider( + minimum=0, maximum=9, step=1, value=0, label='Toxicity', interactive=True, visible=True + ) + humor_value = gr.Slider( + minimum=0, maximum=9, step=1, value=0, label='Humor', interactive=True, visible=True + ) + creativity_value = gr.Slider( + minimum=0, maximum=9, step=1, value=0, label='Creativity', interactive=True, visible=True + ) + violence_value = gr.Slider( + minimum=0, maximum=9, step=1, value=0, label='Violence', interactive=True, visible=True + ) + helpfulness_value = gr.Slider( + minimum=0, maximum=9, step=1, value=9, label='Helpfulness', interactive=True, visible=True + ) + not_appropriate_value = gr.Slider( + minimum=0, maximum=9, step=1, value=0, label='Not Appropriate', interactive=True, visible=True + ) + used_value = gr.CheckboxGroup(keys, value=keys) + + def change_visibility(x): + values = [] + for key in keys: + if key in x: + values.append(gr.update(visible=True)) + else: + values.append(gr.update(visible=False)) + return values + + used_value.change( + change_visibility, + inputs=[used_value], + outputs=[ + quality_value, + toxicity_value, + humor_value, + creativity_value, + violence_value, + helpfulness_value, + not_appropriate_value, + ], + ) + + def set_sampling(x): + if x == 'Custom': + values = [gr.update(value=v, interactive=True) for v in PRESETS[x].values()] + return values + else: + values = [gr.update(value=v, interactive=False) for v in PRESETS[x].values()] + return values + + sampling_method.change( + set_sampling, inputs=[sampling_method], outputs=[temperature, top_p, top_k, repetition_penality] ) - gr.HTML("


") - human_name = gr.Textbox(label="Human Name", value="User", line=1,) - assistant_name = gr.Textbox(label="Assistant Name", value="Assistant", line=1,) - preamble = gr.Textbox(label="System", value=DEFAULT_SYSTEM, lines=2,) - with gr.Column(scale=1, min_width=800): + + gr.HTML("
") + human_name = gr.Textbox(label="Human Name", value=defaults['user'], line=1,) + assistant_name = gr.Textbox(label="Assistant Name", value=defaults['assistant'], line=1,) + preamble = gr.Textbox(label="System", value=defaults['system'], lines=2,) + + def set_prompt(x): + if x == "DIALOGUE": + return '', '' + return defaults['user'], defaults['assistant'] + + prompt_presets.change(set_prompt, inputs=[prompt_presets], outputs=[human_name, assistant_name]) + + with gr.Column(scale=1, min_width=900): chatbot = Chatbot(elem_id="chatbot").style(height=800) msg = gr.Textbox(label="User", value="", lines=1,) clear = gr.Button("Clear") @@ -183,45 +300,86 @@ def user(user_message, history, session_state): user_message = user_message.replace('\n', '
') return "", history + [[user_message, None]] + def get_value_str(values_array, used_value): + if len(used_value) == 0: + return '' + assert len(values_array) == len(keys) + value_str = '' + elements = [] + for i, key in enumerate(keys): + if key in used_value: + elements.append(f'{key}:{values_array[i]}') + value_str += ','.join(elements) + '\n' + return value_str + def bot( history, preamble, - greedy_flag, - add_BOS, token_to_gen, - min_token_to_gen, temperature, top_p, top_k, repetition_penality, - end_strings, + seed, human_name, assistant_name, session_state, + prompts_presets, + quality_value, + toxicity_value, + humor_value, + creativity_value, + violence_value, + helpfulness_value, + not_appropriate_value, + used_value, ): + + values_array = [ + quality_value, + toxicity_value, + humor_value, + creativity_value, + violence_value, + helpfulness_value, + not_appropriate_value, + ] + if value: + value_str = get_value_str(values_array, used_value) + else: + value_str = '' + + prompt_preset = PROMPT_PRESETS[prompts_presets] prompt_text = '' names = [human_name, assistant_name] + turn_tokens = [prompt_preset['USER_TURN_TOKEN'], prompt_preset['BOT_TURN_TOKEN']] for i, meg in enumerate(session_state): name = names[i % 2] - prompt_text += TURN_TOKEN + name + '\n' + meg + '\n' - prompt_text += TURN_TOKEN + assistant_name + '\n' - bot_message = create_gen_function(server_port, chat=True)( + turn = turn_tokens[i % 2] + prompt_text += turn + name + prompt_preset['END_OF_NAME'] + meg + prompt_preset['END_OF_TURN'] + prompt_text += ( + prompt_preset['BOT_TURN_TOKEN'] + assistant_name + prompt_preset['END_OF_NAME'] + value_str + ) + prompt_text = prompt_preset['SYSTEM_TURN_TOKEN'] + preamble + prompt_text + bot_message = create_gen_function(server_port)( prompt_text, - preamble, - greedy_flag, - add_BOS, + False, + False, token_to_gen, - min_token_to_gen, + 1, temperature, top_p, top_k, repetition_penality, - end_strings, + '', ) if bot_message.endswith(TURN_TOKEN): 
bot_message = bot_message[: -len(TURN_TOKEN)] history[-1][1] = bot_message - session_state.append(bot_message.strip()) + print(prompt_text) + print(bot_message) + print('-------------------') + session_state.append(value_str + bot_message.strip()) return history msg.submit(user, [msg, chatbot, session_state], [msg, chatbot], queue=False).then( @@ -229,20 +387,26 @@ def bot( [ chatbot, preamble, - greedy_flag, - add_BOS, token_to_gen, - min_token_to_gen, temperature, top_p, top_k, repetition_penality, - end_strings, + seed, human_name, assistant_name, session_state, + prompt_presets, + quality_value, + toxicity_value, + humor_value, + creativity_value, + violence_value, + helpfulness_value, + not_appropriate_value, + used_value, ], - chatbot, + [chatbot], ) def clear_fun(session_state): From 47c9d743b05fd5db63a095ab5448e0fc52197e5b Mon Sep 17 00:00:00 2001 From: Kunal Dhawan Date: Thu, 29 Jun 2023 08:51:01 -0700 Subject: [PATCH 076/123] Fix confidence ensembles RNNT logprobs selection logic for exclude_blank scenario (#6937) * fixed rnnt logprob selection logic for exclude_blank scenario Signed-off-by: KunalDhawan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix all blank ctc edge case Signed-off-by: KunalDhawan --------- Signed-off-by: KunalDhawan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/models/confidence_ensemble.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index 0a5441a1cd52..cd4738e7b97c 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++ b/nemo/collections/asr/models/confidence_ensemble.py @@ -86,9 +86,10 @@ def get_filtered_logprobs(hypothesis: Hypothesis, exclude_blank: bool) -> torch. 
filtered_logprobs = [] for alignment in hypothesis.alignments: for align_elem in alignment: - if exclude_blank and align_elem[1].item() != align_elem[0].shape[-1] - 1: + if not exclude_blank: + filtered_logprobs.append(align_elem[0]) + elif align_elem[1].item() != align_elem[0].shape[-1] - 1: filtered_logprobs.append(align_elem[0]) - filtered_logprobs.append(align_elem[0]) if not filtered_logprobs: # for the edge-case of all blanks filtered_logprobs.append(align_elem[0]) filtered_logprobs = torch.stack(filtered_logprobs) @@ -101,6 +102,8 @@ def get_filtered_logprobs(hypothesis: Hypothesis, exclude_blank: bool) -> torch. if exclude_blank: # filtering blanks labels = logprobs.argmax(dim=-1) filtered_logprobs = logprobs[labels != logprobs.shape[1] - 1] + if filtered_logprobs.shape[0] == 0: # for the edge-case of all blanks + filtered_logprobs = logprobs[:1] else: filtered_logprobs = logprobs return filtered_logprobs @@ -136,6 +139,7 @@ def compute_confidence(hypothesis: Hypothesis, confidence_cfg: ConfidenceConfig) conf_func = get_confidence_measure_bank()[conf_type] conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() + return conf_value From 7e6e04a6f5e84aa928a984fdfe9f896f1add3278 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 29 Jun 2023 15:06:20 -0600 Subject: [PATCH 077/123] Add ub communicator initialization to validation step (#6814) * Add ub communicator initialization to validation step (#6807) * fix code qol Signed-off-by: ericharper --------- Signed-off-by: ericharper Co-authored-by: Sangkug Lym Co-authored-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 817ef0bd6442..44b484b28949 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -22,7 +22,6 @@ import torch from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( @@ -53,7 +52,6 @@ SamplingParam, TextGeneration, ) -from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable from nemo.core.classes.common import PretrainedModelInfo @@ -512,37 +510,38 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean + def initialize_ub_func(self): + input_shape = [ + self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('hidden_size'), + ] + ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) + ub_cfgs = None + if ub_cfg_file_name is not None: + try: + import yaml + + with open(ub_cfg_file_name, 'r') as ub_cfg_file: + ub_cfgs = yaml.safe_load(ub_cfg_file) + except (ImportError, TypeError): + logging.error(f"Fail to read ub_tp_comm_overlap config file: {ub_cfg_file_name}.") + te_module.initialize_ub( + shape=input_shape, + tp_size=self.cfg.get('tensor_model_parallel_size'), + use_fp8=self.cfg.get('fp8'), + ub_cfgs=ub_cfgs, + ) + self.initialize_ub = False + def training_step(self, dataloader_iter, batch_idx): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - # Initialize userbuffer communicators. Initialization is done only once at the - # beginning of the first training step. + # Initialize userbuffer communicators. 
if self.initialize_ub: - input_shape = [ - self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), - self.cfg.get('hidden_size'), - ] - ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None) - ub_cfgs = None - if ub_cfg_file_name is not None: - try: - import yaml - - with open(ub_cfg_file_name, 'r') as ub_cfg_file: - ub_cfgs = yaml.safe_load(ub_cfg_file) - except (ImportError, TypeError): - print("Fail to read ub_tp_comm_overlap config file.") - - te_module.initialize_ub( - shape=input_shape, - tp_size=self.cfg.get('tensor_model_parallel_size'), - use_fp8=self.cfg.get('fp8'), - ub_cfgs=ub_cfgs, - ) - self.initialize_ub = False + self.initialize_ub_func() if self.rampup_batch_size: num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR @@ -873,6 +872,10 @@ def validation_step(self, dataloader_iter, batch_idx): from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ + # Initialize userbuffer communicators. 
+ if self.initialize_ub: + self.initialize_ub_func() + if isinstance(self.model, list): for model_module in self.model: model_module.eval() From 89225e4105a939050a8ba973ea7abbc1aab543cb Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 29 Jun 2023 14:58:13 -0700 Subject: [PATCH 078/123] Add missing save restore connector to eval scripts (#6935) Signed-off-by: smajumdar Co-authored-by: Eric Harper --- examples/nlp/language_modeling/megatron_gpt_eval.py | 8 +++++++- examples/nlp/language_modeling/megatron_t5_eval.py | 10 +++++++++- .../language_modeling/tuning/megatron_gpt_ia3_eval.py | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index dc299fccdf90..2a6890e1a9b4 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -174,8 +174,14 @@ def main(cfg) -> None: or cfg.pipeline_model_parallel_size < 0 or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True, + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, ) with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 0282f9fb2913..0b6ea54b6b99 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -13,6 +13,7 @@ # limitations under the License. 
+import os from argparse import ArgumentParser import torch @@ -61,8 +62,15 @@ def main(): or args.pipeline_model_parallel_size < 0 or args.pipeline_model_parallel_split_rank < 0 ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(args.model_file): + save_restore_connector.model_extracted_dir = args.model_file + model_config = MegatronT5Model.restore_from( - restore_path=args.model_file, trainer=Trainer(strategy=NLPDDPStrategy()), return_config=True, + restore_path=args.model_file, + trainer=Trainer(strategy=NLPDDPStrategy()), + return_config=True, + save_restore_connector=save_restore_connector, ) args.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py index a676fee00a7e..a30818f29fb3 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import os import torch import torch.multiprocessing as mp from megatron.core import parallel_state From 0dee17b40f6e3af2333c4cd4ba5fa34fe91994fb Mon Sep 17 00:00:00 2001 From: Vahid Noroozi Date: Thu, 29 Jun 2023 15:04:39 -0700 Subject: [PATCH 079/123] added cache-aware checkpoints. 
(#6940) Signed-off-by: vnoroozi --- docs/source/asr/data/benchmark_en.csv | 5 ++++- .../asr/data/scores/en/conformer_en.csv | 6 ++++++ .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 21 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/source/asr/data/benchmark_en.csv b/docs/source/asr/data/benchmark_en.csv index 5c764ba38651..684d9f9fa76d 100644 --- a/docs/source/asr/data/benchmark_en.csv +++ b/docs/source/asr/data/benchmark_en.csv @@ -31,4 +31,7 @@ stt_en_fastconformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" stt_en_fastconformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge" stt_en_fastconformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge" -stt_en_fastconformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge" \ No newline at end of file +stt_en_fastconformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge" +stt_en_fastconformer_hybrid_large_streaming_80ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_80ms" +stt_en_fastconformer_hybrid_large_streaming_480ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_480ms" +stt_en_fastconformer_hybrid_large_streaming_1040ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_1040ms" \ No newline at end of file diff --git a/docs/source/asr/data/scores/en/conformer_en.csv 
b/docs/source/asr/data/scores/en/conformer_en.csv index 905bdf2ebedc..d77f0a687ce8 100644 --- a/docs/source/asr/data/scores/en/conformer_en.csv +++ b/docs/source/asr/data/scores/en/conformer_en.csv @@ -12,3 +12,9 @@ stt_en_conformer_transducer_large,en,,,1.6,3.5,1.7,3.7,,,,,,,,,,,, stt_en_conformer_transducer_large_ls,en,,,2.1,5.0,2.3,5.1,,,,,,,,,,,, stt_en_conformer_transducer_xlarge,en,,,1.48 %,2.95 %,1.62 %,3.01 %,,6.46 %,4.59 %,5.32 %,5.70 %,6.47 %,21.32 %,,,,2.05 %,1.17 % stt_en_conformer_transducer_xxlarge,en,,,1.52 %,3.09 %,1.72 %,3.14 %,,,5.29 %,5.85 %,6.64 %,,,,,,2.42 %,1.49 % +stt_en_fastconformer_hybrid_large_streaming_80ms (CTC),en,,,,,3.5 %,8.1 %,,,10.2 %,7.2 %,,,,,,,3.5 %,2.3 % +stt_en_fastconformer_hybrid_large_streaming_480ms (CTC),en,,,,,3.6 %,7.5 %,,,9.8 %,7.0 %,,,,,,,3.5 %,2.1 % +stt_en_fastconformer_hybrid_large_streaming_1040ms (CTC),en,,,,,2.7 %,6.4 %,,,9.0 %,7.0 %,,,,,,,3.2 %,1.9 % +stt_en_fastconformer_hybrid_large_streaming_80ms (RNNT),en,,,,,2.7 %,6.5 %,,,9.1 %,6.9 %,,,,,,,3.2 %,1.9 % +stt_en_fastconformer_hybrid_large_streaming_480ms (RNNT),en,,,,,2.7 %,6.1 %,,,8.5 %,6.7 %,,,,,,,3.1 %,1.8 % +stt_en_fastconformer_hybrid_large_streaming_1040ms (RNNT),en,,,,,2.3 %,5.5 %,,,8.0 %,6.6 %,,,,,,,2.9 %,1.6 % diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 6637486f18dc..6604983b6461 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -523,4 +523,25 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_80ms", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_80ms", + 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_80ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_80ms.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_480ms", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_480ms", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_480ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_480ms.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_1040ms", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_1040ms", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_1040ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_1040ms.nemo", + ) + results.append(model) + return results From 85e3e151f30f6c76409d3d2856b3fa795f8da7ff Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 29 Jun 2023 16:07:40 -0600 Subject: [PATCH 080/123] Merge release r1.19.0 into main (#6948) * Cut branch r1.19.0 Signed-off-by: smajumdar * Tutorial fixes (#6717) Signed-off-by: smajumdar * fix notebook error (#6840) Signed-off-by: Yi Dong * fix (#6842) Signed-off-by: Yi Dong * update branch Signed-off-by: ericharper * update package info Signed-off-by: ericharper --------- Signed-off-by: smajumdar Signed-off-by: Yi Dong Signed-off-by: ericharper Co-authored-by: Somshubra Majumdar Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> --- nemo/package_info.py | 2 +- ...on_Synthetic_Tabular_Data_Generation.ipynb | 21 +++++++++++++++++++ 
.../tools/CTC_Segmentation_Tutorial.ipynb | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/nemo/package_info.py b/nemo/package_info.py index 709159dd575a..17efc924de32 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -14,7 +14,7 @@ MAJOR = 1 -MINOR = 19 +MINOR = 20 PATCH = 0 PRE_RELEASE = 'rc0' diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index b7ae11ef3f5d..bfd3c7094198 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "c3217a15", "metadata": {}, @@ -15,6 +16,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8c72dc42", "metadata": {}, @@ -25,6 +27,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "79154a9e", "metadata": {}, @@ -73,6 +76,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7e0bbc89", "metadata": {}, @@ -92,6 +96,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1ff1d46f", "metadata": {}, @@ -141,6 +146,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aa356012", "metadata": {}, @@ -239,6 +245,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "02bff63f", "metadata": {}, @@ -267,6 +274,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "89e1e5b3", "metadata": {}, @@ -339,6 +347,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "05ebadc3", "metadata": {}, @@ -347,6 +356,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2fe38a29", "metadata": {}, @@ -381,6 +391,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "678f65ef", "metadata": {}, @@ -411,6 +422,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8af66b4a", "metadata": {}, @@ -464,6 +476,7 @@ ] }, { + "attachments": {}, "cell_type": 
"markdown", "id": "6ecec681", "metadata": {}, @@ -472,6 +485,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "58a3d4fa", "metadata": {}, @@ -543,6 +557,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "45ac928f", "metadata": {}, @@ -557,6 +572,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "158a4bbe", "metadata": {}, @@ -597,6 +613,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fa16378e", "metadata": {}, @@ -605,6 +622,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ed056ec6", "metadata": {}, @@ -630,6 +648,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a62b48dc", "metadata": {}, @@ -685,6 +704,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cccd54d9", "metadata": {}, @@ -790,6 +810,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0f2f6e3a", "metadata": {}, diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 98f0cce4e9ec..15a82a36a1b2 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -280,7 +280,7 @@ "* `max_length` argument - max number of words in a segment for alignment (used only if there are no punctuation marks present in the original text. Long non-speech segments are better for segments split and are more likely to co-occur with punctuation marks. Random text split could deteriorate the quality of the alignment.\n", "* out-of-vocabulary words will be removed based on pre-trained ASR model vocabulary, and the text will be changed to lowercase \n", "* sentences for alignment with the original punctuation and capitalization will be stored under `$OUTPUT_DIR/processed/*_with_punct.txt`\n", - "* numbers will be converted from written to their spoken form with `num2words` package. 
For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/main/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", + "* numbers will be converted from written to their spoken form with `num2words` package. For English, it's recommended to use NeMo normalization tool use `--use_nemo_normalization` argument (not supported if running this segmentation tutorial in Colab, see the text normalization tutorial: [`https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb`](https://colab.research.google.com/github/NVIDIA/NeMo-text-processing/blob/r1.19.0/tutorials/Text_(Inverse)_Normalization.ipynb) for more details). Even `num2words` normalization is usually enough for proper segmentation. However, it does not take audio into account. 
NeMo supports audio-based normalization for English, German and Russian languages that can be applied to the segmented data as a post-processing step. Audio-based normalization produces multiple normalization options. For example, `901` could be normalized as `nine zero one` or `nine hundred and one`. The audio-based normalization chooses the best match among the possible normalization options and the transcript based on the character error rate. See [https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize_with_audio.py](https://github.com/NVIDIA/NeMo-text-processing/blob/r1.19.0/nemo_text_processing/text_normalization/normalize_with_audio.py) for more details.\n", "\n", "### Audio preprocessing:\n", "* non '.wav' audio files will be converted to `.wav` format\n", From e5706a0e70be2cb6abfd5698ab5f7be40860127d Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 30 Jun 2023 11:35:57 -0600 Subject: [PATCH 081/123] upgrade base container (#6938) Signed-off-by: ericharper --- Dockerfile | 4 ++-- Jenkinsfile | 2 +- README.rst | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7722555357b2..3aa4c39d6a4d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.06-py3 # build an image that includes only the nemo dependencies, ensures that dependencies # are included first for optimal caching, and useful for building a development @@ -94,7 +94,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.19.0 +ARG NEMO_VERSION=1.20.0 # Check that NEMO_VERSION is set. Build will fail without this. 
Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/Jenkinsfile b/Jenkinsfile index 1a79d87bcd38..be62291daf24 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:23.04-py3' + image 'nvcr.io/nvidia/pytorch:23.06-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1' } } diff --git a/README.rst b/README.rst index 8a788da71550..7ac95b8cef70 100644 --- a/README.rst +++ b/README.rst @@ -319,13 +319,13 @@ To build a nemo container with Dockerfile from a branch, please run DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . -If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub. +If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub. .. 
code-block:: bash docker run --gpus all -it --rm -v :/NeMo --shm-size=8g \ -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ - stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3 + stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3 Examples -------- From b0e5bf3627dbcfb3f4a72d73d3c5e92184d8b1f6 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 30 Jun 2023 17:32:52 -0700 Subject: [PATCH 082/123] Fix requirements for pydantic + inflect (#6956) * Fix requirements for pydantic + inflect Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- requirements/requirements_asr.txt | 1 - requirements/requirements_common.txt | 2 ++ requirements/requirements_nlp.txt | 1 - requirements/requirements_tts.txt | 1 - 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index fdeaeb2d450d..011862ad723b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -1,7 +1,6 @@ braceexpand editdistance g2p_en -inflect ipywidgets jiwer kaldi-python-io diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt index 29d8ac4dd49b..a4d343a32d1a 100644 --- a/requirements/requirements_common.txt +++ b/requirements/requirements_common.txt @@ -1,4 +1,6 @@ +inflect pandas +pydantic<2 # remove after inflect supports Pydantic 2.0+ sacremoses>=0.0.43 sentencepiece<1.0.0 youtokentome>=1.0.5 diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 2018de6fbc31..68d8b8985748 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -7,7 +7,6 @@ ftfy gdown h5py ijson -inflect jieba markdown2 matplotlib>=3.3.2 diff --git a/requirements/requirements_tts.txt 
b/requirements/requirements_tts.txt index 20484871ee4b..bb330aaf2e58 100644 --- a/requirements/requirements_tts.txt +++ b/requirements/requirements_tts.txt @@ -1,6 +1,5 @@ attrdict einops -inflect jieba kornia librosa From 0b6e4e61bd23cbf9704dac431756d491adab084d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:01:35 -0700 Subject: [PATCH 083/123] Update distopt API for coalesced NCCL calls (#6886) * Update distopt API for coalesced NCCL calls Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update comment Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 6 +++--- nemo/core/optim/distributed_adam.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3aa4c39d6a4d..2e6b617087bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,11 +45,11 @@ RUN apt-get update && \ WORKDIR /workspace/ WORKDIR /tmp/ -# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch +# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch # container RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 && \ + git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \ pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ # uninstall stuff from base container @@ -75,7 +75,7 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec # install flash attention dependencies RUN pip install flash-attn # pinned triton version for flash-attention 
https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 -RUN pip install triton==2.0.0.dev20221202 +RUN pip install triton==2.0.0.dev20221202 # install k2, skip if installation fails COPY scripts /tmp/nemo/scripts/ diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 1f2ce90f3ff7..8c3b0a30658f 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -19,6 +19,7 @@ from apex.contrib.optimizers.distributed_fused_adam import ( DistributedFusedAdam, _coalescing_manager, + _coalescing_manager_append_work, _disable_pre_forward_hook, ) from megatron.core import parallel_state @@ -173,16 +174,15 @@ def _fp32_optim_grad_sync(self): for model_param, main_param in self._fp32_optim_main_params.items(): if model_param.grad is not None: main_param.grad += model_param.grad.detach() - sync_requests = [] - with _coalescing_manager(self.process_group, self.device, sync_requests): + with _coalescing_manager(self.process_group, self.device, async_ops=True) as cm: for main_param in self._fp32_optim_main_params.values(): - sync_requests.append( + _coalescing_manager_append_work( + cm, torch.distributed.all_reduce( main_param.grad, op=torch.distributed.ReduceOp.AVG, group=self.process_group, async_op=True, - ) + ), ) - for req in sync_requests: - req.wait() + cm.wait() self._fp32_optim_grad_sync_needed = False def zero_grad(self, *args, **kwargs): From 17447184bdf026b2f88d81353998856170bc09bc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 14:13:13 -0700 Subject: [PATCH 084/123] Remove `compute_on_step` from metrics (#6979) (#6981) * Remove `compute_on_step` from metrics * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove confusing log message * Update tests --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/metrics/rnnt_wer.py | 2 +- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 2 +- nemo/collections/asr/metrics/wer.py | 2 +- nemo/collections/asr/metrics/wer_bpe.py | 2 +- .../common/metrics/global_average_loss_metric.py | 9 ++------- nemo/collections/common/metrics/perplexity.py | 8 ++------ nemo/collections/nlp/metrics/sequence_perplexity.py | 9 ++------- .../nlp/models/language_modeling/bert_lm_model.py | 2 +- .../nlp/models/text2sparql/text2sparql_model.py | 2 +- nemo/core/optim/optimizers.py | 1 - tests/collections/common/pl_utils.py | 8 +++----- 11 files changed, 15 insertions(+), 32 deletions(-) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 55f9f4b5ea9f..7e5636191a1d 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -1224,7 +1224,7 @@ def validation_epoch_end(self, outputs): def __init__( self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False ): - super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index 0870eb180776..d2e2c3cc5923 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -359,7 +359,7 @@ def __init__( log_prediction: bool = True, dist_sync_on_step=False, ): - super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.batch_dim_index = batch_dim_index self.use_cer = use_cer diff --git 
a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 7f7f853d307d..4d90810cc3df 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -1125,7 +1125,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.use_cer = use_cer diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 762acf172a16..8a92e4745a1b 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -247,7 +247,7 @@ def __init__( fold_consecutive=True, dist_sync_on_step=False, ): - super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False) + super().__init__(dist_sync_on_step=dist_sync_on_step) self.decoding = decoding self.tokenizer = self.decoding.tokenizer self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size diff --git a/nemo/collections/common/metrics/global_average_loss_metric.py b/nemo/collections/common/metrics/global_average_loss_metric.py index fae1dbfea5e8..3bbd4d13abf4 100644 --- a/nemo/collections/common/metrics/global_average_loss_metric.py +++ b/nemo/collections/common/metrics/global_average_loss_metric.py @@ -28,9 +28,6 @@ class GlobalAverageLossMetric(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instruction. Args: - compute_on_step: - The method :meth:`forward` only calls ``update()`` and returns ``None`` if this is set to ``False``. 
- default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each method :meth:`forward` call before returning the value at the step @@ -44,10 +41,8 @@ class GlobalAverageLossMetric(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, take_avg_loss=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') self.take_avg_loss = take_avg_loss diff --git a/nemo/collections/common/metrics/perplexity.py b/nemo/collections/common/metrics/perplexity.py index 1158e3408611..9e1c21737ec8 100644 --- a/nemo/collections/common/metrics/perplexity.py +++ b/nemo/collections/common/metrics/perplexity.py @@ -29,8 +29,6 @@ class Perplexity(Metric): See `PyTorch Lightning Metrics `_ for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. 
@@ -44,10 +42,8 @@ class Perplexity(Metric): full_state_update = True - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, validate_args=True): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group - ) + def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True): + super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) self.validate_args = validate_args self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') # Total number of distributions seen since last reset diff --git a/nemo/collections/nlp/metrics/sequence_perplexity.py b/nemo/collections/nlp/metrics/sequence_perplexity.py index 688f9db87ea6..339f062f7cc1 100644 --- a/nemo/collections/nlp/metrics/sequence_perplexity.py +++ b/nemo/collections/nlp/metrics/sequence_perplexity.py @@ -31,8 +31,6 @@ class SequencePerplexity(Metric): See :doc:`PyTorch Lightning Metrics` for the metric usage instructions. Args: - compute_on_step: - Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True`` dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` before returning the value at the step. process_group: @@ -43,12 +41,9 @@ class SequencePerplexity(Metric): to perform the allgather. 
""" - def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): + def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None): super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, + dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn, ) # Total sum of exponentiated average negative log likelihoods diff --git a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py index 4c9d43c20d54..5cf509e77846 100644 --- a/nemo/collections/nlp/models/language_modeling/bert_lm_model.py +++ b/nemo/collections/nlp/models/language_modeling/bert_lm_model.py @@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # create extra bias # setup to track metrics - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py index 5290209b0c95..50046aef0344 100644 --- a/nemo/collections/nlp/models/text2sparql/text2sparql_model.py +++ b/nemo/collections/nlp/models/text2sparql/text2sparql_model.py @@ -100,7 +100,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): decoder=cfg.language_model.pretrained_decoder_model_name, ) - self.validation_perplexity = Perplexity(compute_on_step=False) + self.validation_perplexity = Perplexity() self.setup_optimization(cfg.optim) diff --git a/nemo/core/optim/optimizers.py b/nemo/core/optim/optimizers.py index 76e47e20e0cc..9473ef0af969 100644 --- a/nemo/core/optim/optimizers.py +++ b/nemo/core/optim/optimizers.py @@ -51,7 +51,6 @@ AVAILABLE_OPTIMIZERS['fused_adam'] = FusedAdam except ModuleNotFoundError: HAVE_APEX = False - 
logging.warning("Apex was not found. Using the lamb or fused_adam optimizer will error out.") HAVE_APEX_DISTRIBUTED_ADAM = False if HAVE_APEX: diff --git a/tests/collections/common/pl_utils.py b/tests/collections/common/pl_utils.py index 395c8cef5969..a2e9609c8492 100644 --- a/tests/collections/common/pl_utils.py +++ b/tests/collections/common/pl_utils.py @@ -90,7 +90,7 @@ def _class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + metric = metric_class(dist_sync_on_step=dist_sync_on_step, **metric_args) # verify metrics work after being loaded from pickled state pickled_metric = pickle.dumps(metric) @@ -303,7 +303,7 @@ def _perplexity_class_test( calculated across devices for each batch (and not just at the end) """ # Instanciate lightning metric - perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args) + perplexity = Perplexity(dist_sync_on_step=dist_sync_on_step, **metric_args) if (probs is None) == (logits is None): with pytest.raises(ValueError): perplexity(probs, logits) @@ -464,9 +464,7 @@ def _loss_class_test( calculated across devices for each batch (and not just at the end) """ # Instantiate lightning metric - loss_metric = GlobalAverageLossMetric( - compute_on_step=True, dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss - ) + loss_metric = GlobalAverageLossMetric(dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss) # verify loss works after being loaded from pickled state pickled_metric = pickle.dumps(loss_metric) From 153c30780a2bc72f115b4c697cc303ddc1d451ea Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Thu, 6 Jul 2023 19:40:29 -0700 Subject: [PATCH 085/123] Fix require_grad typos (#6930) Signed-off-by: Sergii Dymchenko --- .../modules/transformer/transformer_generators.py | 12 ++++++------ 
.../common/transformer/transformer_generators.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/nemo/collections/asr/modules/transformer/transformer_generators.py b/nemo/collections/asr/modules/transformer/transformer_generators.py index 504fdf076d3d..6e17151dcd1b 100644 --- a/nemo/collections/asr/modules/transformer/transformer_generators.py +++ b/nemo/collections/asr/modules/transformer/transformer_generators.py @@ -188,7 +188,7 @@ def freeze(self) -> None: param.requires_grad = False self.decoder.eval() for param in self.log_softmax.parameters(): - param.require_grad = False + param.requires_grad = False self.log_softmax.eval() def unfreeze(self) -> None: @@ -201,7 +201,7 @@ def unfreeze(self) -> None: param.requires_grad = True self.decoder.train() for param in self.log_softmax.parameters(): - param.require_grad = True + param.requires_grad = True self.log_softmax.train() @contextmanager @@ -701,10 +701,10 @@ def freeze(self) -> None: param.requires_grad = False self.decoders[model_num].eval() for param in self.log_softmaxes[model_num].parameters(): - param.require_grad = False + param.requires_grad = False self.log_softmaxes[model_num].eval() for param in self.encoders[model_num].parameters(): - param.require_grad = False + param.requires_grad = False self.encoders[model_num].eval() def unfreeze(self) -> None: @@ -718,10 +718,10 @@ def unfreeze(self) -> None: param.requires_grad = True self.decoders[model_num].train() for param in self.log_softmaxes[model_num].parameters(): - param.require_grad = True + param.requires_grad = True self.log_softmaxes[model_num].train() for param in self.encoders[model_num].parameters(): - param.require_grad = True + param.requires_grad = True self.encoders[model_num].train() @contextmanager diff --git a/nemo/collections/nlp/modules/common/transformer/transformer_generators.py b/nemo/collections/nlp/modules/common/transformer/transformer_generators.py index 504fdf076d3d..6e17151dcd1b 100644 --- 
a/nemo/collections/nlp/modules/common/transformer/transformer_generators.py +++ b/nemo/collections/nlp/modules/common/transformer/transformer_generators.py @@ -188,7 +188,7 @@ def freeze(self) -> None: param.requires_grad = False self.decoder.eval() for param in self.log_softmax.parameters(): - param.require_grad = False + param.requires_grad = False self.log_softmax.eval() def unfreeze(self) -> None: @@ -201,7 +201,7 @@ def unfreeze(self) -> None: param.requires_grad = True self.decoder.train() for param in self.log_softmax.parameters(): - param.require_grad = True + param.requires_grad = True self.log_softmax.train() @contextmanager @@ -701,10 +701,10 @@ def freeze(self) -> None: param.requires_grad = False self.decoders[model_num].eval() for param in self.log_softmaxes[model_num].parameters(): - param.require_grad = False + param.requires_grad = False self.log_softmaxes[model_num].eval() for param in self.encoders[model_num].parameters(): - param.require_grad = False + param.requires_grad = False self.encoders[model_num].eval() def unfreeze(self) -> None: @@ -718,10 +718,10 @@ def unfreeze(self) -> None: param.requires_grad = True self.decoders[model_num].train() for param in self.log_softmaxes[model_num].parameters(): - param.require_grad = True + param.requires_grad = True self.log_softmaxes[model_num].train() for param in self.encoders[model_num].parameters(): - param.require_grad = True + param.requires_grad = True self.encoders[model_num].train() @contextmanager From 19449a9a00c346d5900dbf3df3864e378c0db23e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Jul 2023 14:18:45 -0400 Subject: [PATCH 086/123] fix the mpt chatbot (#6957) (#6968) Signed-off-by: Yi Dong Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> --- .../nlp/modules/common/megatron_web_server.py | 3 +++ .../modules/common/text_generation_strategy.py | 16 ++++++++++------ 2 files changed, 13 
insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron_web_server.py b/nemo/collections/nlp/modules/common/megatron_web_server.py index 648bca024ba0..7c04ef201927 100644 --- a/nemo/collections/nlp/modules/common/megatron_web_server.py +++ b/nemo/collections/nlp/modules/common/megatron_web_server.py @@ -90,6 +90,9 @@ def get_generation(prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_ response = text_generation(data, port=port) sentences = response['sentences'] bot_message = sentences[0] + if bot_message.find('<extra_id_0') < 0: + # hack due to the problem that huggingface's tokenizer strips out the <extra_id_x> token + prompt = prompt.replace('<extra_id_0>', '').replace('<extra_id_1>', '').replace('<extra_id_2>', '') bot_message = bot_message[len(prompt) :] return bot_message diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 8608c0c9a680..573bdc80735e 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -153,15 +153,19 @@ def end_of_generation_condition( else: tokenizer = self.model.tokenizer conditions = [] + end_tokens = set() + end_tokens.add(eod_id) + for end_string in end_strings: + ids_1 = tokenizer.text_to_ids(f'<extra_id_1>{end_string}') + ids_2 = tokenizer.text_to_ids('<extra_id_1>') + if len(ids_1) <= len(ids_2): + continue + token_id = ids_1[len(ids_2) :][0] + end_tokens.add(token_id) for p, token_item in zip(prev, tokens): text = tokenizer.ids_to_text(token_item.tolist()) conditions.append( - any( - [ - p.item() == eod_id if end_string == END_OF_SEQ else text.endswith(end_string) - for end_string in end_strings - ] - ) + any([text.endswith(end_string) for end_string in end_strings] + [p.item() in end_tokens]) ) return torch.tensor(conditions, dtype=torch.bool, device=tokens.device) From ff430e479d52f8e2d3a04d682f53be0372f2ac15 Mon Sep 17 00:00:00 2001 From: Zhilin Wang Date: Fri, 7 Jul 2023 12:41:18 -0700 Subject: [PATCH 087/123] add support for max_total_length=4096 for 43b
(#6763) * add support for max_total_length=4096 for 43b Signed-off-by: Zhilin Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Zhilin Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../customization_dataset_preparation.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/customization_dataset_preparation/customization_dataset_preparation.py b/tools/customization_dataset_preparation/customization_dataset_preparation.py index 071c06e20803..53582f5489f1 100644 --- a/tools/customization_dataset_preparation/customization_dataset_preparation.py +++ b/tools/customization_dataset_preparation/customization_dataset_preparation.py @@ -41,6 +41,7 @@ 1. `--drop_duplicates` : Use this flag to drop rows that are exactly the same for both prompt and completion 2. `--split_train_validation` : Use this flag to split one file into separate train and validation files. 3. `--val_proportion 0.1`: Use a float (default 0.1) between 0 and 1 to control how much of the dataset to allocate to the validation set and the remaining for the train dataset. +4. `--short_context_model`: Use this flag to prepare data for use with models that have shorter context length of 2048 tokens (e.g. 5B and 20B models) What to expect @@ -396,6 +397,12 @@ def print_all_messages(messages): parser.add_argument("--completion_template", "-ct", default="{completion}") parser.add_argument("--drop_duplicates", "-dd", action="store_true") parser.add_argument("--split_train_validation", "-stv", action="store_true") + parser.add_argument( + "--short_context_model", + "-scm", + action="store_true", + help="Specifies if using models with shorter context length of 2048 tokens e.g. 
5B and 20B models", + ) parser.add_argument( "--val_proportion", "-vp", @@ -409,8 +416,13 @@ def print_all_messages(messages): messages = [] messages.append(str(args)) + if args.short_context_model: + MAX_TOKEN_LENGTH = 2048 + else: + MAX_TOKEN_LENGTH = 4096 + # every token is around 4 chars - MAX_TOTAL_CHAR_LENGTH = 4 * 2048 + MAX_TOTAL_CHAR_LENGTH = 4 * MAX_TOKEN_LENGTH df, message = load_file_into_df(args.filename) messages.append(message) From b22a21ffac1fcf1a5d3e2f0c70b6b6263c5a152a Mon Sep 17 00:00:00 2001 From: Vadim Kantorov Date: Fri, 7 Jul 2023 22:11:16 +0200 Subject: [PATCH 088/123] rnnt_greedy_decoding.py: typos? auto-repressively -> auto-regressively (#6989) Signed-off-by: Vadim Kantorov --- .../parts/submodules/rnnt_greedy_decoding.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index 42b14fd7b8bf..ac10e54bb249 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -241,7 +241,7 @@ def _joint_step(self, enc, pred, log_normalize: Optional[bool] = None): class GreedyRNNTInfer(_GreedyRNNTInfer): """A greedy transducer decoder. - Sequence level greedy decoding, performed auto-repressively. + Sequence level greedy decoding, performed auto-regressively. Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. @@ -326,7 +326,7 @@ def forward( partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. + Output token is generated auto-regressively. Args: encoder_output: A tensor of size (batch, features, timesteps). @@ -479,7 +479,7 @@ def _greedy_decode( class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): """A batch level greedy transducer decoder. 
- Batch level greedy decoding, performed auto-repressively. + Batch level greedy decoding, performed auto-regressively. Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. @@ -571,7 +571,7 @@ def forward( partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. + Output token is generated auto-regressively. Args: encoder_output: A tensor of size (batch, features, timesteps). @@ -1034,7 +1034,7 @@ def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per def __call__(self, audio_signal: torch.Tensor, length: torch.Tensor): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. + Output token is generated auto-regressively. Args: encoder_output: A tensor of size (batch, features, timesteps). @@ -1455,7 +1455,7 @@ def _get_initial_states(self, batchsize): class GreedyMultiblankRNNTInfer(GreedyRNNTInfer): """A greedy transducer decoder for multi-blank RNN-T. - Sequence level greedy decoding, performed auto-repressively. + Sequence level greedy decoding, performed auto-regressively. Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. @@ -1655,7 +1655,7 @@ def _greedy_decode( class GreedyBatchedMultiblankRNNTInfer(GreedyBatchedRNNTInfer): """A batch level greedy transducer decoder. - Batch level greedy decoding, performed auto-repressively. + Batch level greedy decoding, performed auto-regressively. Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. joint_model: rnnt_utils.AbstractRNNTJoint implementation. @@ -2207,7 +2207,7 @@ class GreedyBatchedRNNTInferConfig: class GreedyTDTInfer(_GreedyRNNTInfer): """A greedy TDT decoder. - Sequence level greedy decoding, performed auto-repressively. + Sequence level greedy decoding, performed auto-regressively. 
Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. @@ -2289,7 +2289,7 @@ def forward( partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. + Output token is generated auto-regressively. Args: encoder_output: A tensor of size (batch, features, timesteps). encoded_lengths: list of int representing the length of each sequence @@ -2459,7 +2459,7 @@ def _greedy_decode( class GreedyBatchedTDTInfer(_GreedyRNNTInfer): """A batch level greedy TDT decoder. - Batch level greedy decoding, performed auto-repressively. + Batch level greedy decoding, performed auto-regressively. Args: decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. joint_model: rnnt_utils.AbstractRNNTJoint implementation. @@ -2547,7 +2547,7 @@ def forward( partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. + Output token is generated auto-regressively. Args: encoder_output: A tensor of size (batch, features, timesteps). 
encoded_lengths: list of int representing the length of each sequence From 07b79d2061cbcf98f2f0bc9106f810610183fc96 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Jul 2023 14:51:30 -0700 Subject: [PATCH 089/123] Cache handling without input tensors mutation (#6980) (#6996) * Cache handling without input tensors mutation * Cleanup * Cleanup#2 * Cleanup#3 --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev Co-authored-by: Somshubra Majumdar --- nemo/collections/asr/models/asr_model.py | 64 ++++++----------- .../asr/modules/conformer_encoder.py | 48 ++++++------- .../multi_head_attention_adapter_module.py | 16 ++--- .../asr/parts/submodules/causal_convs.py | 28 ++++---- .../asr/parts/submodules/conformer_modules.py | 70 +++++++------------ .../parts/submodules/multi_head_attention.py | 53 ++++++++------ 6 files changed, 118 insertions(+), 161 deletions(-) diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index c0f4c1cd0a70..6ac3633201e2 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -161,7 +161,7 @@ def output_module(self): @property def output_names(self): otypes = self.output_module.output_types - if hasattr(self.input_module, 'export_cache_support') and self.input_module.export_cache_support: + if getattr(self.input_module, 'export_cache_support', False): in_types = self.input_module.output_types otypes = {n: t for (n, t) in list(otypes.items())[:1]} for (n, t) in list(in_types.items())[1:]: @@ -174,7 +174,6 @@ def forward_for_export( """ This forward is used when we need to export the model to ONNX format. Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. - When they are passed, it just passes the inputs through the encoder part and currently the ONNX conversion does not fully work for this case. 
Args: input: Tensor that represents a batch of raw audio signals, of shape [B, T]. T here represents timesteps. @@ -187,49 +186,26 @@ def forward_for_export( Returns: the output of the model """ - if hasattr(self.input_module, 'forward_for_export'): - if cache_last_channel is None and cache_last_time is None: - encoder_output = self.input_module.forward_for_export(audio_signal=input, length=length) - else: - encoder_output = self.input_module.forward_for_export( - audio_signal=input, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) + enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward) + if cache_last_channel is None: + encoder_output = enc_fun(audio_signal=input, length=length) + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] else: - if cache_last_channel is None and cache_last_time is None: - encoder_output = self.input_module(audio_signal=input, length=length) - else: - encoder_output = self.input_module( - audio_signal=input, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - if isinstance(encoder_output, tuple): - decoder_input = encoder_output[0] - else: - decoder_input = encoder_output - if hasattr(self.output_module, 'forward_for_export'): - if cache_last_channel is None and cache_last_time is None: - ret = self.output_module.forward_for_export(encoder_output=decoder_input) - else: - ret = self.output_module.forward_for_export(encoder_output=decoder_input) - else: - if cache_last_channel is None and cache_last_time is None: - ret = self.output_module(encoder_output=decoder_input) - else: - ret = self.output_module(encoder_output=decoder_input) - if cache_last_channel is None and cache_last_time is None: - pass - else: - if isinstance(ret, tuple): - ret = (ret[0], encoder_output[1], encoder_output[2], 
encoder_output[3], encoder_output[4]) - else: - ret = (ret, encoder_output[1], encoder_output[2], encoder_output[3], encoder_output[4]) + encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun( + audio_signal=input, + length=length, + cache_last_channel=cache_last_channel, + cache_last_time=cache_last_time, + cache_last_channel_len=cache_last_channel_len, + ) + + dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward) + ret = dec_fun(encoder_output=encoder_output) + if isinstance(ret, tuple): + ret = ret[0] + if cache_last_channel is not None: + ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len) return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32) @property diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 74c255741039..8f429c25806d 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -505,11 +505,6 @@ def forward_internal( (audio_signal.size(0),), audio_signal.size(-1), dtype=torch.int64, device=audio_signal.device ) - if cache_last_time is not None: - cache_last_time_next = torch.zeros_like(cache_last_time) - else: - cache_last_time_next = None - # select a random att_context_size with the distribution specified by att_context_probs during training # for non-validation cases like test, validation or inference, it uses the first mode in self.att_context_size if self.training and len(self.att_context_size_all) > 1: @@ -536,7 +531,6 @@ def forward_internal( if cache_last_channel is not None: cache_len = self.streaming_cfg.last_channel_cache_size cache_keep_size = max_audio_length - self.streaming_cfg.cache_drop_size - cache_last_channel_next = torch.zeros_like(cache_last_channel) max_audio_length = max_audio_length + cache_len padding_length = length + cache_len offset = torch.neg(cache_last_channel_len) + 
cache_len @@ -561,19 +555,32 @@ def forward_internal( pad_mask = pad_mask[:, cache_len:] if att_mask is not None: att_mask = att_mask[:, cache_len:] + # Convert caches from the tensor to list + cache_last_time_next = [] + cache_last_channel_next = [] for lth, (drop_prob, layer) in enumerate(zip(self.layer_drop_probs, self.layers)): original_signal = audio_signal + if cache_last_channel is not None: + cache_last_channel_cur = cache_last_channel[lth] + cache_last_time_cur = cache_last_time[lth] + else: + cache_last_channel_cur = None + cache_last_time_cur = None audio_signal = layer( x=audio_signal, att_mask=att_mask, pos_emb=pos_emb, pad_mask=pad_mask, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_next=cache_last_channel_next, - cache_last_time_next=cache_last_time_next, + cache_last_channel=cache_last_channel_cur, + cache_last_time=cache_last_time_cur, ) + + if cache_last_channel_cur is not None: + (audio_signal, cache_last_channel_cur, cache_last_time_cur) = audio_signal + cache_last_channel_next.append(cache_last_channel_cur) + cache_last_time_next.append(cache_last_time_cur) + # applying stochastic depth logic from https://arxiv.org/abs/2102.03216 if self.training and drop_prob > 0.0: should_drop = torch.rand(1) < drop_prob @@ -626,6 +633,8 @@ def forward_internal( length = length.to(dtype=torch.int64) if cache_last_channel is not None: + cache_last_channel_next = torch.stack(cache_last_channel_next, dim=0) + cache_last_time_next = torch.stack(cache_last_time_next, dim=0) return ( audio_signal, length, @@ -860,20 +869,12 @@ def setup_streaming_params( else: streaming_cfg.drop_extra_pre_encoded = streaming_cfg.pre_encode_cache_size // self.subsampling_factor - # counting the number of the layers need caching - streaming_cfg.last_channel_num = 0 - streaming_cfg.last_time_num = 0 for m in self.layers.modules(): if hasattr(m, "_max_cache_len"): if isinstance(m, MultiHeadAttention): - m._cache_id = 
streaming_cfg.last_channel_num m.cache_drop_size = streaming_cfg.cache_drop_size - streaming_cfg.last_channel_num += 1 - if isinstance(m, CausalConv1D): - m._cache_id = streaming_cfg.last_time_num m.cache_drop_size = streaming_cfg.cache_drop_size - streaming_cfg.last_time_num += 1 self.streaming_cfg = streaming_cfg @@ -886,19 +887,12 @@ def get_initial_cache_state(self, batch_size=1, dtype=torch.float32, device=None create_tensor = torch.zeros last_time_cache_size = self.conv_context_size[0] cache_last_channel = create_tensor( - ( - self.streaming_cfg.last_channel_num, - batch_size, - self.streaming_cfg.last_channel_cache_size, - self.d_model, - ), + (len(self.layers), batch_size, self.streaming_cfg.last_channel_cache_size, self.d_model,), device=device, dtype=dtype, ) cache_last_time = create_tensor( - (self.streaming_cfg.last_time_num, batch_size, self.d_model, last_time_cache_size), - device=device, - dtype=dtype, + (len(self.layers), batch_size, self.d_model, last_time_cache_size), device=device, dtype=dtype, ) if max_dim > 0: cache_last_channel_len = torch.randint( diff --git a/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py b/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py index 169dde48602f..563d4219baa7 100644 --- a/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py +++ b/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py @@ -147,18 +147,18 @@ def __init__( # reset parameters for Q to be identity operation self.reset_parameters() - def forward(self, query, key, value, mask, pos_emb=None, cache=None, cache_next=None): + def forward(self, query, key, value, mask, pos_emb=None, cache=None): """Compute 'Scaled Dot Product Attention'. 
Args: query (torch.Tensor): (batch, time1, size) key (torch.Tensor): (batch, time2, size) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2) - cache (torch.Tensor) : (cache_nums, batch, time_cache, size) - cache_next (torch.Tensor) : (cache_nums, batch, time_cache_next, size) + cache (torch.Tensor) : (batch, time_cache, size) returns: output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention + cache (torch.Tensor) : (batch, time_cache_next, size) """ # Need to perform duplicate computations as at this point the tensors have been # separated by the adapter forward @@ -166,7 +166,7 @@ def forward(self, query, key, value, mask, pos_emb=None, cache=None, cache_next= key = self.pre_norm(key) value = self.pre_norm(value) - return super().forward(query, key, value, mask, pos_emb, cache=cache, cache_next=cache_next) + return super().forward(query, key, value, mask, pos_emb, cache=cache) def reset_parameters(self): with torch.no_grad(): @@ -242,7 +242,7 @@ def __init__( # reset parameters for Q to be identity operation self.reset_parameters() - def forward(self, query, key, value, mask, pos_emb, cache=None, cache_next=None): + def forward(self, query, key, value, mask, pos_emb, cache=None): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. 
Args: query (torch.Tensor): (batch, time1, size) @@ -250,10 +250,10 @@ def forward(self, query, key, value, mask, pos_emb, cache=None, cache_next=None) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2) pos_emb (torch.Tensor) : (batch, time1, size) - cache (torch.Tensor) : (cache_nums, batch, time_cache, size) - cache_next (torch.Tensor) : (cache_nums, batch, time_cache_next, size) + cache (torch.Tensor) : (batch, time_cache, size) Returns: output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention + cache_next (torch.Tensor) : (batch, time_cache_next, size) """ # Need to perform duplicate computations as at this point the tensors have been # separated by the adapter forward @@ -261,7 +261,7 @@ def forward(self, query, key, value, mask, pos_emb, cache=None, cache_next=None) key = self.pre_norm(key) value = self.pre_norm(value) - return super().forward(query, key, value, mask, pos_emb, cache=cache, cache_next=cache_next) + return super().forward(query, key, value, mask, pos_emb, cache=cache) def reset_parameters(self): with torch.no_grad(): diff --git a/nemo/collections/asr/parts/submodules/causal_convs.py b/nemo/collections/asr/parts/submodules/causal_convs.py index 25f841802154..c6251690b1b1 100644 --- a/nemo/collections/asr/parts/submodules/causal_convs.py +++ b/nemo/collections/asr/parts/submodules/causal_convs.py @@ -45,7 +45,6 @@ def __init__( raise ValueError("Argument padding should be set to None for CausalConv2D.") self._left_padding = kernel_size - 1 self._right_padding = stride - 1 - self._cache_id = None padding = 0 super(CausalConv2D, self).__init__( @@ -113,7 +112,6 @@ def __init__( raise ValueError(f"Invalid padding param: {padding}!") self._max_cache_len = self._left_padding - self._cache_id = None super(CausalConv1D, self).__init__( in_channels=in_channels, @@ -129,21 +127,21 @@ def __init__( dtype=dtype, ) - def update_cache(self, x, cache=None, cache_next=None): + 
def update_cache(self, x, cache=None): if cache is None: new_x = F.pad(x, pad=(self._left_padding, self._right_padding)) else: new_x = F.pad(x, pad=(0, self._right_padding)) - new_x = torch.cat([cache[self._cache_id], new_x], dim=-1) - # todo: we should know input_x.size(-1) at config time - if cache_next is not None: - cache_keep_size = torch.tensor(x.size(-1) - self.cache_drop_size, dtype=torch.int64, device=x.device) - cache_keep_size = torch.clip(cache_keep_size, min=1, max=cache_next.size(-1)) - cache_next[self._cache_id, :, :, :-cache_keep_size] = cache[self._cache_id, :, :, cache_keep_size:] - cache_next[self._cache_id, :, :, -cache_keep_size:] = x[:, :, :cache_keep_size] - return new_x - - def forward(self, x, cache=None, cache_next=None): - x = self.update_cache(x, cache=cache, cache_next=cache_next) + new_x = torch.cat([cache, new_x], dim=-1) + if self.cache_drop_size > 0: + x = x[:, :, : -self.cache_drop_size] + cache = torch.cat([cache[:, :, x.size(-1) :], x], dim=-1) + return new_x, cache + + def forward(self, x, cache=None): + x, cache = self.update_cache(x, cache=cache) x = super().forward(x) - return x + if cache is None: + return x + else: + return x, cache diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index 579b78a8f5a8..677d2acd9f2e 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -138,29 +138,19 @@ def __init__( self.dropout = nn.Dropout(dropout) self.norm_out = LayerNorm(d_model) - def forward( - self, - x, - att_mask=None, - pos_emb=None, - pad_mask=None, - cache_last_channel=None, - cache_last_time=None, - cache_last_channel_next=None, - cache_last_time_next=None, - ): + def forward(self, x, att_mask=None, pos_emb=None, pad_mask=None, cache_last_channel=None, cache_last_time=None): """ Args: x (torch.Tensor): input signals (B, T, d_model) att_mask (torch.Tensor): 
attention masks(B, T, T) pos_emb (torch.Tensor): (L, 1, d_model) pad_mask (torch.tensor): padding mask - cache_last_channel (torch.tensor) : cache for MHA layers (N, B, T_cache, d_model) - cache_last_time (torch.tensor) : cache for convolutional layers (N, B, d_model, T_cache) - cache_last_channel_next (torch.tensor) : next cache for MHA layers (N, B, T_cache, d_model) - cache_last_time_next (torch.tensor) : next cache for convolutional layers (N, B, d_model, T_cache) + cache_last_channel (torch.tensor) : cache for MHA layers (B, T_cache, d_model) + cache_last_time (torch.tensor) : cache for convolutional layers (B, d_model, T_cache) Returns: x (torch.Tensor): (B, T, d_model) + cache_last_channel (torch.tensor) : next cache for MHA layers (B, T_cache, d_model) + cache_last_time (torch.tensor) : next cache for convolutional layers (B, d_model, T_cache) """ residual = x x = self.norm_feed_forward1(x) @@ -169,31 +159,17 @@ def forward( x = self.norm_self_att(residual) if self.self_attention_model == 'rel_pos': - x = self.self_attn( - query=x, - key=x, - value=x, - mask=att_mask, - pos_emb=pos_emb, - cache=cache_last_channel, - cache_next=cache_last_channel_next, - ) + x = self.self_attn(query=x, key=x, value=x, mask=att_mask, pos_emb=pos_emb, cache=cache_last_channel) elif self.self_attention_model == 'rel_pos_local_attn': - x = self.self_attn( - query=x, - key=x, - value=x, - pad_mask=pad_mask, - pos_emb=pos_emb, - cache=cache_last_channel, - cache_next=cache_last_channel_next, - ) + x = self.self_attn(query=x, key=x, value=x, pad_mask=pad_mask, pos_emb=pos_emb, cache=cache_last_channel) elif self.self_attention_model == 'abs_pos': - x = self.self_attn( - query=x, key=x, value=x, mask=att_mask, cache=cache_last_channel, cache_next=cache_last_channel_next - ) + x = self.self_attn(query=x, key=x, value=x, mask=att_mask, cache=cache_last_channel) else: x = None + + if x is not None and cache_last_channel is not None: + (x, cache_last_channel) = x + residual = residual + 
self.dropout(x) if self.is_adapter_available(): @@ -208,7 +184,9 @@ def forward( residual = pack_ip['x'] x = self.norm_conv(residual) - x = self.conv(x, pad_mask=pad_mask, cache=cache_last_time, cache_next=cache_last_time_next) + x = self.conv(x, pad_mask=pad_mask, cache=cache_last_time) + if cache_last_time is not None: + (x, cache_last_time) = x residual = residual + self.dropout(x) x = self.norm_feed_forward2(residual) @@ -228,8 +206,10 @@ def forward( if self.is_access_enabled() and self.access_cfg.get('save_encoder_tensors', False): self.register_accessible_tensor(name='encoder', tensor=x) - - return x + if cache_last_channel is None: + return x + else: + return x, cache_last_channel, cache_last_time def forward_single_enabled_adapter_( self, @@ -355,7 +335,7 @@ def __init__( in_channels=dw_conv_input_dim, out_channels=d_model, kernel_size=1, stride=1, padding=0, bias=True ) - def forward(self, x, pad_mask=None, cache=None, cache_next=None): + def forward(self, x, pad_mask=None, cache=None): x = x.transpose(1, 2) x = self.pointwise_conv1(x) @@ -368,10 +348,9 @@ def forward(self, x, pad_mask=None, cache=None, cache_next=None): if pad_mask is not None: x = x.float().masked_fill(pad_mask.unsqueeze(1), 0.0) + x = self.depthwise_conv(x, cache=cache) if cache is not None: - x = self.depthwise_conv(x, cache=cache, cache_next=cache_next) - else: - x = self.depthwise_conv(x) + x, cache = x if self.norm_type == "layer_norm": x = x.transpose(1, 2) @@ -383,7 +362,10 @@ def forward(self, x, pad_mask=None, cache=None, cache_next=None): x = self.activation(x) x = self.pointwise_conv2(x) x = x.transpose(1, 2) - return x + if cache is None: + return x + else: + return x, cache def reset_parameters_conv(self): pw1_max = pw2_max = self.d_model ** -0.5 diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index b7356ffe87e4..a0253524419e 100644 --- 
a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -73,7 +73,6 @@ def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0): self.dropout = nn.Dropout(p=dropout_rate) self._max_cache_len = max_cache_len - self._cache_id = None def forward_qkv(self, query, key, value): """Transforms query, key and value. @@ -119,20 +118,20 @@ def forward_attention(self, value, scores, mask): return self.linear_out(x) # (batch, time1, d_model) - def forward(self, query, key, value, mask, pos_emb=None, cache=None, cache_next=None): + def forward(self, query, key, value, mask, pos_emb=None, cache=None): """Compute 'Scaled Dot Product Attention'. Args: query (torch.Tensor): (batch, time1, size) key (torch.Tensor): (batch, time2, size) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2) - cache (torch.Tensor) : (cache_nums, batch, time_cache, size) - cache_next (torch.Tensor) : (cache_nums, batch, time_cache_next, size) + cache (torch.Tensor) : (batch, time_cache, size) returns: output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention + cache (torch.Tensor) : (batch, time_cache_next, size) """ - key, value, query = self.update_cache(key=key, value=value, query=query, cache=cache, cache_next=cache_next) + key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) if torch.is_autocast_enabled(): query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) @@ -142,17 +141,17 @@ def forward(self, query, key, value, mask, pos_emb=None, cache=None, cache_next= q, k, v = self.forward_qkv(query, key, value) scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k out = self.forward_attention(v, scores, mask) + if cache is None: + return out + else: + return out, cache - return out - - def update_cache(self, key, value, query, cache, cache_next): + def 
update_cache(self, key, value, query, cache): if cache is not None: - key = value = torch.cat([cache[self._cache_id], key], dim=1) + key = value = torch.cat([cache, key], dim=1) q_keep_size = query.shape[1] - self.cache_drop_size - if cache_next is not None: - cache_next[self._cache_id, :, :-q_keep_size, :] = cache[self._cache_id, :, q_keep_size:, :] - cache_next[self._cache_id, :, -q_keep_size:, :] = query[:, :q_keep_size, :] - return key, value, query + cache = torch.cat([cache[:, q_keep_size:, :], query[:, :q_keep_size, :]], dim=1) + return key, value, query, cache class RelPositionMultiHeadAttention(MultiHeadAttention): @@ -195,7 +194,7 @@ def rel_shift(self, x): x = x[:, :, 1:].view(b, h, qlen, pos_len) # (b, h, t1, t2) return x - def forward(self, query, key, value, mask, pos_emb, cache=None, cache_next=None): + def forward(self, query, key, value, mask, pos_emb, cache=None): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (torch.Tensor): (batch, time1, size) @@ -203,12 +202,13 @@ def forward(self, query, key, value, mask, pos_emb, cache=None, cache_next=None) value(torch.Tensor): (batch, time2, size) mask (torch.Tensor): (batch, time1, time2) pos_emb (torch.Tensor) : (batch, time1, size) - cache (torch.Tensor) : (cache_nums, batch, time_cache, size) - cache_next (torch.Tensor) : (cache_nums, batch, time_cache_next, size) + cache (torch.Tensor) : (batch, time_cache, size) + Returns: output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention + cache (torch.Tensor) : (batch, time_cache_next, size) """ - key, value, query = self.update_cache(key=key, value=value, query=query, cache=cache, cache_next=cache_next) + key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) if torch.is_autocast_enabled(): query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) @@ -244,7 +244,10 @@ def forward(self, query, key, 
value, mask, pos_emb, cache=None, cache_next=None) out = self.forward_attention(v, scores, mask) - return out + if cache is None: + return out + else: + return out, cache class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): @@ -298,7 +301,7 @@ def __init__( self.global_k = nn.Linear(n_feat, n_feat) self.global_v = nn.Linear(n_feat, n_feat) - def forward(self, query, key, value, pad_mask, pos_emb, cache=None, cache_next=None): + def forward(self, query, key, value, pad_mask, pos_emb, cache=None): """Compute Scaled Dot Product Local Attention with rel. positional encoding. using overlapping chunks Args: query (torch.Tensor): (batch, time, size) @@ -306,13 +309,13 @@ def forward(self, query, key, value, pad_mask, pos_emb, cache=None, cache_next=N value(torch.Tensor): (batch, time, size) pad_mask (torch.Tensor): (batch, time) pos_emb (torch.Tensor) : (batch, 2w + 1, size) - cache (torch.Tensor) : (cache_nums, batch, time_cache, size) - cache_next (torch.Tensor) : (cache_nums, batch, time_cache_next, size) + cache (torch.Tensor) : (batch, time_cache, size) Returns: output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention + cache (torch.Tensor) : (batch, time_cache_next, size) """ - key, value, query = self.update_cache(key=key, value=value, query=query, cache=cache, cache_next=cache_next) + key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) if torch.is_autocast_enabled(): query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) @@ -453,7 +456,11 @@ def forward(self, query, key, value, pad_mask, pos_emb, cache=None, cache_next=N out[is_index_global_attn_nonzero] += out_global_to_all - return self.linear_out(out.reshape(n_batch, -1, self.h * self.d_k)[:, :T]) + ret = self.linear_out(out.reshape(n_batch, -1, self.h * self.d_k)[:, :T]) + if cache is None: + return ret + else: + return ret, cache def 
_get_global_attn_indices(self, is_index_global_attn: torch.Tensor) -> Tuple: """ From 94e1efaace9f761a8676a0d3cc7b54c57db736c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Jul 2023 14:52:07 -0700 Subject: [PATCH 090/123] Hybrid conformer export (#6983) (#6995) * Implemented generic kv-pair setting of export_config from args * Hybrid conformer export * Hybrid decoder export * Cleanup * Changed from **kwargs * Docstring * Docs added * Stringify args * Added docs for ASR export configs * lowercase ctc --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev --- docs/source/asr/models.rst | 10 ++++++ docs/source/core/export.rst | 31 +++++++++++++++++++ nemo/collections/asr/models/asr_model.py | 8 +++++ .../asr/models/hybrid_rnnt_ctc_models.py | 14 +++++++++ nemo/collections/asr/models/rnnt_models.py | 12 +++++-- nemo/core/classes/exportable.py | 14 +++++++++ scripts/export.py | 19 +++++++++--- 7 files changed, 102 insertions(+), 6 deletions(-) diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 80a0fd90f0fb..697a89827145 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -215,6 +215,11 @@ It is recommended to train a model in streaming model with limited context for t You may find FastConformer variants of cache-aware streaming models under ``/examples/asr/conf/fastconformer/``. +Note cache-aware streaming models are being exported without caching support by default. +To include caching support, `model.set_export_config({'cache_support' : 'True'})` should be called before export. +Or, if ``/scripts/export.py`` is being used: +`python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --config cache_support=True` + .. 
_LSTM-Transducer_model: LSTM-Transducer @@ -291,6 +296,11 @@ Similar example configs for FastConformer variants of Hybrid models can be found ``/examples/asr/conf/fastconformer/hybrid_transducer_ctc/`` ``/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/`` +Note Hybrid models are being exported as RNNT (encoder and decoder+joint parts) by default. +To export as CTC (single encoder+decoder graph), `model.set_export_config({'decoder_type' : 'ctc'})` should be called before export. +Or, if ``/scripts/export.py`` is being used: +`python export.py hybrid_transducer.nemo hybrid_transducer.onnx --config decoder_type=ctc` + .. _Conformer-HAT_model: Conformer-HAT (Hybrid Autoregressive Transducer) diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst index 0e598e215dbf..f54daffe9c9c 100644 --- a/docs/source/core/export.rst +++ b/docs/source/core/export.rst @@ -177,6 +177,37 @@ Another common requirement for models that are being exported is to run certain # call base method for common set of modifications Exportable._prepare_for_export(self, **kwargs) +Some models that require control flow need to be exported in multiple parts. Typical examples are RNNT nets. +To facilitate that, the hooks below are provided. To export, for example, 'encoder' and 'decoder' subnets of the model, overload list_export_subnets to return ['encoder', 'decoder']. + +.. code-block:: Python + + def get_export_subnet(self, subnet=None): + """ + Returns Exportable subnet model/module to export + """ + + + def list_export_subnets(self): + """ + Returns default set of subnet names exported for this model + First goes the one receiving input (input_example) + """ + +Some networks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR).
To facilitate that, the `set_export_config()` method is provided by Exportable to set key/value pairs in the predefined model.export_config dictionary, to be used during the export: + +.. code-block:: Python + def set_export_config(self, args): + """ + Sets/updates export_config dictionary + """ +Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one. +An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``. + +Here is an example of how the `set_export_config()` call is tied to command line arguments in ``/scripts/export.py`` : + +.. code-block:: Python + python scripts/export.py hybrid_conformer.nemo hybrid_conformer.onnx --config decoder_type=ctc Exportable Model Code ~~~~~~~~~~~~~~~~~~~~~ diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index 6ac3633201e2..7e03d587139f 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -215,3 +215,11 @@ def disabled_deployment_input_names(self): @property def disabled_deployment_output_names(self): return self.encoder.disabled_deployment_output_names + + def set_export_config(self, args): + if 'cache_support' in args: + enable = bool(args['cache_support']) + self.encoder.export_cache_support = enable + logging.info(f"Caching support enabled: {enable}") + self.encoder.setup_streaming_params() + super().set_export_config(args) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 5ca6124ecfd7..11c616b1257f 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -645,6 +645,20 @@ def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): self.finalize_interctc_metrics(metrics, outputs, prefix="test_") return metrics + # EncDecRNNTModel is exported in 2 parts + def list_export_subnets(self): + if
self.cur_decoder == 'rnnt': + return ['encoder', 'decoder_joint'] + else: + return ['self'] + + @property + def output_module(self): + if self.cur_decoder == 'rnnt': + return self.decoder + else: + return self.ctc_decoder + @classmethod def list_available_models(cls) -> Optional[PretrainedModelInfo]: """ diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 92bb04fd2a3e..0c1da97c5012 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -28,7 +28,7 @@ from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs from nemo.collections.asr.losses.rnnt import RNNTLoss, resolve_rnnt_default_loss_name from nemo.collections.asr.metrics.rnnt_wer import RNNTWER, RNNTDecoding, RNNTDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel from nemo.collections.asr.modules.rnnt import RNNTDecoderJoint from nemo.collections.asr.parts.mixins import ASRModuleMixin from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType @@ -39,7 +39,7 @@ from nemo.utils import logging -class EncDecRNNTModel(ASRModel, ASRModuleMixin, Exportable): +class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecModel): """Base class for encoder decoder RNNT-based models.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -960,6 +960,14 @@ def list_export_subnets(self): def decoder_joint(self): return RNNTDecoderJoint(self.decoder, self.joint) + def set_export_config(self, args): + if 'decoder_type' in args: + if hasattr(self, 'change_decoding_strategy'): + self.change_decoding_strategy(decoder_type=args['decoder_type']) + else: + raise Exception("Model does not have decoder type option") + super().set_export_config(args) + @classmethod def list_available_models(cls) -> List[PretrainedModelInfo]: """ diff --git 
a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py index 3d2682f2304e..8469e80219d6 100644 --- a/nemo/core/classes/exportable.py +++ b/nemo/core/classes/exportable.py @@ -302,3 +302,17 @@ def list_export_subnets(self): First goes the one receiving input (input_example) """ return ['self'] + + def get_export_config(self): + """ + Returns export_config dictionary + """ + return getattr(self, 'export_config', {}) + + def set_export_config(self, args): + """ + Sets/updates export_config dictionary + """ + ex_config = self.get_export_config() + ex_config.update(args) + self.export_config = ex_config diff --git a/scripts/export.py b/scripts/export.py index fe3b79ebdf28..4b21bc4ffd73 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -62,6 +62,15 @@ def get_args(argv): ) parser.add_argument("--device", default="cuda", help="Device to export for") parser.add_argument("--check-tolerance", type=float, default=0.01, help="tolerance for verification") + parser.add_argument( + "--config", + metavar="KEY=VALUE", + nargs='+', + help="Set a number of key-value pairs to model.export_config dictionary " + "(do not put spaces before or after the = sign). 
" + "Note that values are always treated as strings.", + ) + args = parser.parse_args(argv) return args @@ -130,10 +139,12 @@ def nemo_export(argv): in_args["max_dim"] = args.max_dim max_dim = args.max_dim - if args.cache_support and hasattr(model, "encoder") and hasattr(model.encoder, "export_cache_support"): - model.encoder.export_cache_support = True - logging.info("Caching support is enabled.") - model.encoder.setup_streaming_params() + if args.cache_support: + model.set_export_config({"cache_support": "True"}) + + if args.config: + kv = dict(map(lambda s: s.split('='), args.config)) + model.set_export_config(kv) autocast = nullcontext if args.autocast: From 112c80607a05523b271ef58cb7f6af856d7e2df7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 10 Jul 2023 11:48:49 -0700 Subject: [PATCH 091/123] Fixing an issue with confidence ensembles (#6987) (#7004) * Bug fix for the confidence ensembles * Relax constraints for the test --------- Signed-off-by: Igor Gitman Co-authored-by: Igor Gitman --- examples/asr/transcribe_speech.py | 8 ++++++-- nemo/collections/asr/models/confidence_ensemble.py | 9 +++++---- scripts/confidence_ensembles/build_ensemble.py | 6 ++---- .../confidence_ensembles/test_confidence_ensembles.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 4ed3d92a6305..401755bc8275 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -130,6 +130,8 @@ class TranscriptionConfig: # Set to True to output greedy timestamp information (only supported models) compute_timestamps: bool = False + # set to True if need to return full alignment information + preserve_alignment: bool = False # Set to True to output language ID information compute_langs: bool = False @@ -230,6 +232,8 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # 
we will adjust this flag if the model does not support it compute_timestamps = cfg.compute_timestamps compute_langs = cfg.compute_langs + # has to be True if timestamps are required + preserve_alignment = True if cfg.compute_timestamps else cfg.preserve_alignment # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): @@ -252,7 +256,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis decoding_cfg = cfg.rnnt_decoding if cfg.decoder_type == 'rnnt' else cfg.ctc_decoding decoding_cfg.compute_timestamps = cfg.compute_timestamps # both ctc and rnnt support it if 'preserve_alignments' in decoding_cfg: - decoding_cfg.preserve_alignments = cfg.compute_timestamps + decoding_cfg.preserve_alignments = preserve_alignment if 'compute_langs' in decoding_cfg: decoding_cfg.compute_langs = cfg.compute_langs if hasattr(asr_model, 'cur_decoder'): @@ -267,7 +271,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis cfg.rnnt_decoding.compute_langs = cfg.compute_langs if 'preserve_alignments' in cfg.rnnt_decoding: - cfg.rnnt_decoding.preserve_alignments = cfg.compute_timestamps + cfg.rnnt_decoding.preserve_alignments = preserve_alignment asr_model.change_decoding_strategy(cfg.rnnt_decoding) else: diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index cd4738e7b97c..dd52d9a7010a 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++ b/nemo/collections/asr/models/confidence_ensemble.py @@ -106,6 +106,11 @@ def get_filtered_logprobs(hypothesis: Hypothesis, exclude_blank: bool) -> torch. 
filtered_logprobs = logprobs[:1] else: filtered_logprobs = logprobs + + # need to make sure logprobs are always normalized, so checking if they sum up to 1 + if not torch.allclose(filtered_logprobs[0].exp().sum(), torch.tensor(1.0)): + filtered_logprobs = torch.log_softmax(filtered_logprobs, dim=1) + return filtered_logprobs @@ -217,10 +222,6 @@ def update_decoding_parameters(self, decoding_cfg: DictConfig): with open_dict(decoding_cfg): decoding_cfg.temperature = self.cfg.temperature decoding_cfg.preserve_alignments = True - if 'confidence_cfg' in decoding_cfg: - decoding_cfg.confidence_cfg.preserve_frame_confidence = True - else: - decoding_cfg.confidence_cfg = ConfidenceConfig(preserve_frame_confidence=True) def setup_training_data(self, train_data_config: Union[DictConfig, Dict]): """Pass-through to the ensemble models. diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py index 07ceccb8b3d5..e953dec02b7a 100644 --- a/scripts/confidence_ensembles/build_ensemble.py +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -458,7 +458,7 @@ def find_best_confidence( return best_conf_spec.to_confidence_config(), best_pipe -@hydra_runner(schema=BuildEnsembleConfig) +@hydra_runner(config_name="BuildEnsembleConfig", schema=BuildEnsembleConfig) def main(cfg: BuildEnsembleConfig): # silencing all messages from nemo/ptl to avoid dumping tons of configs to the stdout logging.getLogger('pytorch_lightning').setLevel(logging.CRITICAL) @@ -471,12 +471,10 @@ def main(cfg: BuildEnsembleConfig): pl.seed_everything(cfg.random_seed) cfg.transcription.random_seed = None # seed is already applied cfg.transcription.return_transcriptions = True - # that sets preserve_alignment to True - cfg.transcription.compute_timestamps = True + cfg.transcription.preserve_alignment = True cfg.transcription.ctc_decoding.temperature = cfg.temperature cfg.transcription.rnnt_decoding.temperature = cfg.temperature # this ensures that generated output 
is after log-softmax for consistency with CTC - cfg.transcription.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True train_confidences = [] dev_confidences = [] diff --git a/scripts/confidence_ensembles/test_confidence_ensembles.py b/scripts/confidence_ensembles/test_confidence_ensembles.py index b665375c0c33..fa537529ab6b 100644 --- a/scripts/confidence_ensembles/test_confidence_ensembles.py +++ b/scripts/confidence_ensembles/test_confidence_ensembles.py @@ -113,4 +113,4 @@ def test_confidence_ensemble(tmp_path, build_args): ) results = speech_to_text_eval.main(eval_cfg) - assert results.metric_value < 0.15 # relaxed check for better than 15% WER + assert results.metric_value < 0.20 # relaxed check for better than 20% WER From 68b4d1f9e0271d41af5c0598ad1b13f2b2738323 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Tue, 11 Jul 2023 08:55:51 -0700 Subject: [PATCH 092/123] [TTS] Add cosine distance option to TTS aligner (#6806) * [TTS] Add cosine distance option to TTS aligner Signed-off-by: Ryan * [TTS] Update aligner comments Signed-off-by: Ryan --------- Signed-off-by: Ryan --- examples/tts/conf/fastpitch/fastpitch.yaml | 2 + nemo/collections/tts/models/fastpitch.py | 16 ++-- nemo/collections/tts/modules/aligner.py | 92 +++++++++++++++++----- nemo/collections/tts/modules/submodules.py | 2 +- 4 files changed, 79 insertions(+), 33 deletions(-) diff --git a/examples/tts/conf/fastpitch/fastpitch.yaml b/examples/tts/conf/fastpitch/fastpitch.yaml index 1d552d058d76..39d5f395afbc 100644 --- a/examples/tts/conf/fastpitch/fastpitch.yaml +++ b/examples/tts/conf/fastpitch/fastpitch.yaml @@ -193,6 +193,8 @@ model: alignment_module: _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder n_text_channels: ${model.symbols_embedding_dim} + dist_type: cosine + temperature: 15.0 duration_predictor: _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py 
index 1a68d9e51aeb..dc598a9a76d1 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -121,16 +121,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.log_images = cfg.get("log_images", False) self.log_train_images = False - loss_scale = 0.1 if self.learn_alignment else 1.0 - dur_loss_scale = loss_scale - pitch_loss_scale = loss_scale - energy_loss_scale = loss_scale - if "dur_loss_scale" in cfg: - dur_loss_scale = cfg.dur_loss_scale - if "pitch_loss_scale" in cfg: - pitch_loss_scale = cfg.pitch_loss_scale - if "energy_loss_scale" in cfg: - energy_loss_scale = cfg.energy_loss_scale + default_prosody_loss_scale = 0.1 if self.learn_alignment else 1.0 + dur_loss_scale = cfg.get("dur_loss_scale", default_prosody_loss_scale) + pitch_loss_scale = cfg.get("pitch_loss_scale", default_prosody_loss_scale) + energy_loss_scale = cfg.get("energy_loss_scale", default_prosody_loss_scale) self.mel_loss_fn = MelLoss() self.pitch_loss_fn = PitchLoss(loss_scale=pitch_loss_scale) @@ -139,7 +133,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.aligner = None if self.learn_alignment: - aligner_loss_scale = cfg.aligner_loss_scale if "aligner_loss_scale" in cfg else 1.0 + aligner_loss_scale = cfg.get("aligner_loss_scale", 1.0) self.aligner = instantiate(self._cfg.alignment_module) self.forward_sum_loss_fn = ForwardSumLoss(loss_scale=aligner_loss_scale) self.bin_loss_fn = BinLoss(loss_scale=aligner_loss_scale) diff --git a/nemo/collections/tts/modules/aligner.py b/nemo/collections/tts/modules/aligner.py index bc170742df23..2910602474fd 100644 --- a/nemo/collections/tts/modules/aligner.py +++ b/nemo/collections/tts/modules/aligner.py @@ -14,6 +14,7 @@ import torch +from einops import rearrange from torch import nn from nemo.collections.tts.modules.submodules import ConditionalInput, ConvNorm @@ -21,10 +22,27 @@ class AlignmentEncoder(torch.nn.Module): - """Module for alignment text and mel 
spectrogram. """ + """ + Module for alignment text and mel spectrogram. + + Args: + n_mel_channels: Dimension of mel spectrogram. + n_text_channels: Dimension of text embeddings. + n_att_channels: Dimension of model + temperature: Temperature to scale distance by. + Suggested to be 0.0005 when using dist_type "l2" and 15.0 when using "cosine". + condition_types: List of types for nemo.collections.tts.modules.submodules.ConditionalInput. + dist_type: Distance type to use for similarity measurement. Supports "l2" and "cosine" distance. + """ def __init__( - self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=0.0005, condition_types=[] + self, + n_mel_channels=80, + n_text_channels=512, + n_att_channels=80, + temperature=0.0005, + condition_types=[], + dist_type="l2", ): super().__init__() self.temperature = temperature @@ -45,27 +63,60 @@ def __init__( torch.nn.ReLU(), ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True), ) + if dist_type == "l2": + self.dist_fn = self.get_euclidean_dist + elif dist_type == "cosine": + self.dist_fn = self.get_cosine_dist + else: + raise ValueError(f"Unknown distance type '{dist_type}'") + + @staticmethod + def _apply_mask(inputs, mask, mask_value): + if mask is None: + return + + mask = rearrange(mask, "B T2 1 -> B 1 1 T2") + inputs.data.masked_fill_(mask, mask_value) def get_dist(self, keys, queries, mask=None): """Calculation of distance matrix. Args: - queries (torch.tensor): B x C x T1 tensor (probably going to be mel data). + queries (torch.tensor): B x C1 x T1 tensor (probably going to be mel data). keys (torch.tensor): B x C2 x T2 tensor (text data). mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries and also can be used for ignoring unnecessary elements from keys in the resulting distance matrix (True = mask element, False = leave unchanged). Output: dist (torch.tensor): B x T1 x T2 tensor. 
""" - keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 - queries_enc = self.query_proj(queries) # B x n_attn_dims x T1 - attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 # B x n_attn_dims x T1 x T2 - dist = attn.sum(1, keepdim=True) # B x 1 x T1 x T2 + # B x C x T1 + queries_enc = self.query_proj(queries) + # B x C x T2 + keys_enc = self.key_proj(keys) + # B x 1 x T1 x T2 + dist = self.dist_fn(queries_enc=queries_enc, keys_enc=keys_enc) + + self._apply_mask(dist, mask, float("inf")) - if mask is not None: - dist.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), float("inf")) + return dist - return dist.squeeze(1) + @staticmethod + def get_euclidean_dist(queries_enc, keys_enc): + queries_enc = rearrange(queries_enc, "B C T1 -> B C T1 1") + keys_enc = rearrange(keys_enc, "B C T2 -> B C 1 T2") + # B x C x T1 x T2 + distance = (queries_enc - keys_enc) ** 2 + # B x 1 x T1 x T2 + l2_dist = distance.sum(axis=1, keepdim=True) + return l2_dist + + @staticmethod + def get_cosine_dist(queries_enc, keys_enc): + queries_enc = rearrange(queries_enc, "B C T1 -> B C T1 1") + keys_enc = rearrange(keys_enc, "B C T2 -> B C 1 T2") + cosine_dist = -torch.nn.functional.cosine_similarity(queries_enc, keys_enc, dim=1) + cosine_dist = rearrange(cosine_dist, "B T1 T2 -> B 1 T1 T2") + return cosine_dist @staticmethod def get_durations(attn_soft, text_len, spect_len): @@ -96,8 +147,7 @@ def get_mean_dist_by_durations(dist, durations, mask=None): batch_size, t1_size, t2_size = dist.size() assert torch.all(torch.eq(durations.sum(dim=1), t1_size)) - if mask is not None: - dist = dist.masked_fill(mask.permute(0, 2, 1).unsqueeze(2), 0) + AlignmentEncoder._apply_mask(dist, mask, 0) # TODO(oktai15): make it more efficient mean_dist_by_durations = [] @@ -149,7 +199,7 @@ def forward(self, queries, keys, mask=None, attn_prior=None, conditioning=None): """Forward pass of the aligner encoder. Args: - queries (torch.tensor): B x C x T1 tensor (probably going to be mel data). 
+ queries (torch.tensor): B x C1 x T1 tensor (probably going to be mel data). keys (torch.tensor): B x C2 x T2 tensor (text data). mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries (True = mask element, False = leave unchanged). attn_prior (torch.tensor): prior for attention matrix. @@ -159,20 +209,20 @@ def forward(self, queries, keys, mask=None, attn_prior=None, conditioning=None): attn_logprob (torch.tensor): B x 1 x T1 x T2 log-prob attention mask. """ keys = self.cond_input(keys.transpose(1, 2), conditioning).transpose(1, 2) - keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 - queries_enc = self.query_proj(queries) # B x n_attn_dims x T1 - - # Simplistic Gaussian Isotopic Attention - attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 # B x n_attn_dims x T1 x T2 - attn = -self.temperature * attn.sum(1, keepdim=True) + # B x C x T1 + queries_enc = self.query_proj(queries) + # B x C x T2 + keys_enc = self.key_proj(keys) + # B x 1 x T1 x T2 + distance = self.dist_fn(queries_enc=queries_enc, keys_enc=keys_enc) + attn = -self.temperature * distance if attn_prior is not None: attn = self.log_softmax(attn) + torch.log(attn_prior[:, None] + 1e-8) attn_logprob = attn.clone() - if mask is not None: - attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), -float("inf")) + self._apply_mask(attn, mask, -float("inf")) attn = self.softmax(attn) # softmax along T2 return attn, attn_logprob diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 408ab02dead2..92218e807aac 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -509,7 +509,7 @@ def forward(self, inputs, conditioning=None): inputs = inputs + conditioning if "concat" in self.condition_types: - conditioning = conditionting.repeat(1, inputs.shape[1], 1) + conditioning = conditioning.repeat(1, inputs.shape[1], 1) inputs = torch.cat([inputs, conditioning]) inputs = 
self.concat_proj(inputs) From 0f79a9f14cb768aa2369dff97b296836ce7ade0e Mon Sep 17 00:00:00 2001 From: trias702 <25867060+trias702@users.noreply.github.com> Date: Tue, 11 Jul 2023 11:38:57 -0500 Subject: [PATCH 093/123] Minor MPT-7B fixes and creation script update (#6982) * Initial commit of minor MPT-7B fixes Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../language_modeling/megatron_base_model.py | 1 + .../convert_mpt_7b_hf_to_nemo.py | 44 ++++++++++++++----- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e018a4decaf6..3f541cfce14e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -221,6 +221,7 @@ def _build_tokenizer(self): merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.get('merge_file', None)), use_fast=self.cfg.tokenizer.get('use_fast', False), delimiter=self.cfg.tokenizer.get('delimiter', None), + special_tokens=self.cfg.tokenizer.get('special_tokens', None), legacy=legacy, ) diff --git a/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py b/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py index 14d7b6ae54ea..fd761b6b20c2 100644 --- a/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py @@ -34,11 +34,19 @@ TP/PP values you want: NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py +* Please note: when using the above script, you MUST also pass the `--megatron_legacy` flag + Failure to do this will result in a corrupt model! 
* + +This script also requires a baseline config file from which to override default parameters. +You can specify the location of this file using the -c argument. You can use any Nemo config +file which is appropriate, but in the default case, we highly recommend you use the following: + NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml + Here is an example usage command: ```python -python scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py -i /path/to/mpt_7b -o /path/to/save +python scripts/nlp_language_modeling/convert_mpt_7b_hf_to_nemo.py -c /path/to/megatron_gpt_config.yaml -i /path/to/mpt_7b -o /path/to/save ``` """ @@ -49,6 +57,7 @@ import pytorch_lightning as pl import torch +import yaml from omegaconf import OmegaConf from nemo.collections.nlp.models.language_modeling.megatron import GPTModel @@ -60,6 +69,9 @@ parser.add_argument( '-i', '--input', required=True, type=str, help='path to the two MPT-7B .bin weight files from HuggingFace' ) + parser.add_argument( + '-c', '--config', required=True, type=str, help='the path to the megatron_gpt_config.yaml file' + ) parser.add_argument( '-o', '--output', required=False, default=None, type=str, help='path to dir where to store output .nemo file' ) @@ -71,22 +83,37 @@ logging.critical(f'Input directory [ {args.input} ] does not exist or cannot be found. Aborting.') exit(255) - model_dict = { - 'micro_batch_size': 4, - 'global_batch_size': 8, + if not os.path.exists(args.config): + logging.critical(f'Path to config file [ {args.config} ] does not exist or cannot be found. 
Aborting.') + exit(255) + + with open(args.config, 'r', encoding='utf_8') as fr: + orig_cfg = yaml.safe_load(fr) + + model_dict = orig_cfg['model'] + if 'tokenizer' in model_dict: + del model_dict['tokenizer'] + if 'data' in model_dict: + del model_dict['data'] + + override_model_dict = { + 'micro_batch_size': 1, + 'global_batch_size': 4, 'rampup_batch_size': None, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'megatron_amp_O2': True, 'transformer_engine': False, - 'use_cpu_initialization': True, + 'use_cpu_initialization': False, 'hidden_size': 4096, + 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'num_attention_heads': 32, 'ffn_hidden_size': 4 * 4096, 'precision': 'bf16', + 'layernorm_epsilon': 1e-5, 'pre_process': True, 'post_process': True, 'num_tokentypes': 0, @@ -114,11 +141,6 @@ 'type': 'EleutherAI/gpt-neox-20b', 'use_fast': True, } - optim_dict = { - 'name': 'fused_adam', - 'lr': 2e-4, - 'weight_decay': 0.01, - } trainer_dict = { 'devices': 1, 'num_nodes': 1, @@ -139,8 +161,8 @@ 'enable_model_summary': False, } + model_dict.update(override_model_dict) model_dict['tokenizer'] = tokeniser_dict - model_dict['optim'] = optim_dict omega_cfg = OmegaConf.create(model_dict) From 0cca30006a40611b5b3d925b86bc226aa7942437 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 11 Jul 2023 12:38:35 -0600 Subject: [PATCH 094/123] Change Jenkins timeout (#6997) * change timeout Signed-off-by: ericharper * change to 8 hours Signed-off-by: ericharper --------- Signed-off-by: ericharper --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index be62291daf24..766d32ebd8c4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -6,7 +6,7 @@ pipeline { } } options { - timeout(time: 2, unit: 'HOURS') + timeout(time: 8, unit: 'HOURS') disableConcurrentBuilds(abortPrevious: true) } From 9e75050e6b64a3d2aa528a7cc9acbb2861bed8d4 Mon Sep 17 
00:00:00 2001 From: Adi Renduchintala Date: Tue, 11 Jul 2023 12:23:23 -0700 Subject: [PATCH 095/123] remove hard coded input and output fields (#7008) * remove hard coded input and output fields Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron/gpt_sft_dataset.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 94c4b3c54c63..756494f2f315 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -147,15 +147,23 @@ def _process_example(self, example): output = example[self.label_key] if self.prompt_template is not None: - assert '{input}' in self.prompt_template - assert '{output}' in self.prompt_template + assert f'{{{self.context_key}}}' in self.prompt_template + assert f'{{{self.label_key}}}' in self.prompt_template # Make sure that '{output}' always occurs at the end of the prompt template string - assert self.prompt_template.index('{output}') == len(self.prompt_template) - len('{output}') + assert self.prompt_template.index(f'{{{self.label_key}}}') == len(self.prompt_template) - len( + f'{{{self.label_key}}}' + ) # Get the context by replacing only the input original_context = context - context = self.prompt_template.replace('{input}', context).replace('{output}', '').strip(' ') + context = ( + self.prompt_template.replace(f'{{{self.context_key}}}', context) + .replace(f'{{{self.label_key}}}', '') + .strip(' ') + ) # Replace the input and output placeholders with the actual input and output - text = self.prompt_template.replace('{input}', 
original_context).replace('{output}', output) + text = self.prompt_template.replace(f'{{{self.context_key}}}', original_context).replace( + f'{{{self.label_key}}}', output + ) if self.separate_prompt_and_response_with_newline and self.prompt_template is None: text = context + '\n' + output From 41d8477404dcce20667f453166a9e7adb178bef4 Mon Sep 17 00:00:00 2001 From: Sandeep Subramanian Date: Tue, 11 Jul 2023 12:56:17 -0700 Subject: [PATCH 096/123] RoPE length extrapolation with interpolation (#7005) * Push changes Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * add continue training script Signed-off-by: MaximumEntropy * [WIP] nonlinear interp Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * override encoder_seq_len Signed-off-by: MaximumEntropy * Remove nonlinear Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * sft with pi (#7006) * sft with pi Signed-off-by: Evelina * update values only if not None" Signed-off-by: Evelina --------- Signed-off-by: Evelina * Address comments Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add info Signed-off-by: MaximumEntropy * Empty Signed-off-by: MaximumEntropy --------- Signed-off-by: MaximumEntropy Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> --- .../conf/megatron_gpt_config.yaml | 1 + .../megatron_gpt_continue_training.py | 193 ++++++++++++++++++ .../tuning/conf/megatron_gpt_sft.yaml | 2 + .../tuning/megatron_gpt_peft_eval.py | 4 + .../tuning/megatron_gpt_sft.py | 9 + .../language_modeling/megatron/gpt_model.py | 2 + .../language_modeling/megatron_gpt_model.py | 17 +- .../modules/common/megatron/language_model.py | 7 +- .../nlp/modules/common/megatron/module.py | 4 +- 
.../rotary_position_embedding.py | 17 +- 10 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 examples/nlp/language_modeling/megatron_gpt_continue_training.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index e588e94a6720..c2b0343c2ff7 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -83,6 +83,7 @@ model: share_embeddings_and_output_weights: True # Share embedding and output layer weights. overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. tokenizer: library: 'megatron' diff --git a/examples/nlp/language_modeling/megatron_gpt_continue_training.py b/examples/nlp/language_modeling/megatron_gpt_continue_training.py new file mode 100644 index 000000000000..e90198833595 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_continue_training.py @@ -0,0 +1,193 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +from omegaconf.omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.core.config import hydra_runner +from nemo.utils import AppState, logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.model_utils import inject_model_parallel_rank + + +def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): + """ + This function modifies the original gpt pre-training config (gpt_cfg) with attributes from the finetuning config (cfg). + The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. 
+ """ + OmegaConf.set_struct(gpt_cfg, True) + OmegaConf.resolve(cfg) + with open_dict(gpt_cfg): + gpt_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + gpt_cfg.micro_batch_size = cfg.model.micro_batch_size + gpt_cfg.global_batch_size = cfg.model.global_batch_size + gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) + gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) + gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) + gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) + gpt_cfg.data = cfg.model.data + gpt_cfg.optim = cfg.model.optim + gpt_cfg.precision = cfg.trainer.precision + gpt_cfg.restore_from_path = cfg.restore_from_path + gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint + gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view + gpt_cfg.encoder_seq_length = cfg.model.encoder_seq_length + gpt_cfg.max_position_embeddings = cfg.model.max_position_embeddings + gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor + gpt_cfg.use_flash_attention = cfg.model.use_flash_attention + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. 
+ if add_cfg_to_tree: + OmegaConf.resolve(gpt_cfg) + gpt_cfg.cfg = gpt_cfg + + return gpt_cfg + + +def load_from_nemo(cls, cfg, trainer, gpt_cfg, modify_confg_fn): + gpt_cfg = modify_confg_fn(gpt_cfg, cfg, add_cfg_to_tree=False) + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.restore_from_path + model = cls.restore_from( + restore_path=cfg.restore_from_path, + trainer=trainer, + override_config_path=gpt_cfg, + save_restore_connector=save_restore_connector, + ) + return model + + +def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): + app_state = AppState() + if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank( + os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) + ) + hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) + gpt_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: 
+ OmegaConf.save(config=gpt_cfg, f=f.name) + model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) + return model + + +def validate_checkpoint_loading_args(cfg): + if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): + raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') + if cfg.checkpoint_name is None: + raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') + if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): + raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) + with_distributed_adam = cfg.model.optim.get('name', 'fused_adam') == 'distributed_fused_adam' + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + if cfg.trainer.precision in [16, 'bf16']: + scaler = None + if cfg.trainer.precision == 16: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + ) + if megatron_amp_o2 and not with_distributed_adam: + plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + + exp_manager(trainer, cfg.exp_manager) + + # 
update resume from checkpoint found by exp_manager + if cfg.model.resume_from_checkpoint is not None: + resume_from_checkpoint = cfg.model.resume_from_checkpoint + else: + resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path + logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') + + trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) + + if cfg.restore_from_path: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.restore_from_path + gpt_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.restore_from_path, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + model = load_from_nemo(MegatronGPTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) + elif cfg.model.get("pretrained_checkpoint", None) is not None: + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronGPTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) + else: + print(' > WARNING: No checkpoint provided. 
Starting from scratch.') + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + model = MegatronGPTModel(cfg.model, trainer) + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index f8a8e6b9dbc0..0e3f0d712dd6 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -60,6 +60,8 @@ model: activations_checkpoint_num_layers: null # not used with 'selective' answer_only_loss: False # not used right now gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: null # if not None, will match the base model's value hidden_dropout: 0.0 attention_dropout: 0.0 diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py index fc427a60d172..ed60328fd812 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -127,6 +127,10 @@ def main(cfg) -> None: peft_model_cfg.data.test_ds = cfg.model.data.test_ds peft_model_cfg.activations_checkpoint_granularity = None peft_model_cfg.activations_checkpoint_method = None + if peft_model_cfg.get("use_flash_attention", False): + peft_model_cfg.use_flash_attention = cfg.model.use_flash_attention + if cfg.model.get("seq_len_interpolation_factor", None) is not None: + peft_model_cfg["seq_len_interpolation_factor"] = cfg.model.seq_len_interpolation_factor with open_dict(cfg): # update the config with the trained model config diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py 
b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py index 0737d55cc514..eb4bd3125cd0 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py @@ -64,6 +64,15 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): sft_cls = MegatronGPTSFTModel gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" + if cfg.model.get('use_flash_attention', None) is not None: + gpt_cfg.use_flash_attention = cfg.model.use_flash_attention + + if cfg.model.get('seq_len_interpolation_factor', None) is not None: + gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor + + sft_cls = MegatronGPTSFTModel + gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" + # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. if add_cfg_to_tree: diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 8e28b6cab362..d70c3e06bf01 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -166,6 +166,7 @@ def __init__( use_emha=False, ub_tp_comm_overlap=False, use_flash_attention=False, + seq_len_interpolation_factor=None, ): super(GPTModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -249,6 +250,7 @@ def __init__( use_emha=use_emha, ub_tp_comm_overlap=ub_tp_comm_overlap, use_flash_attention=use_flash_attention, + seq_len_interpolation_factor=seq_len_interpolation_factor, ) if self.share_embeddings_and_output_weights: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 44b484b28949..55c3786a3d96 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -249,10 +249,20 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if isinstance(self.model, list): converted_model = [] for module in self.model: - converted_model.append(Float16Module(module=module, precision=cfg.precision)) + converted_model.append( + Float16Module( + module=module, + precision=cfg.precision, + share_token_embeddings=self.cfg.get('share_embeddings_and_output_weights', True), + ) + ) self.model = converted_model else: - self.model = Float16Module(module=self.model, precision=cfg.precision) + self.model = Float16Module( + module=self.model, + precision=cfg.precision, + share_token_embeddings=self.cfg.get('share_embeddings_and_output_weights', True), + ) if self.trainer.precision == 'bf16': self.autocast_dtype = torch.bfloat16 @@ -360,6 +370,7 @@ def model_provider_func(self, pre_process, post_process): ub_tp_comm_overlap=self.cfg.get('ub_tp_comm_overlap', False), use_flash_attention=self.cfg.get('use_flash_attention', False), megatron_legacy=self.cfg.get('megatron_legacy', False), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), ) return model @@ -981,7 +992,7 @@ def build_pretraining_data_loader( data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=drop_last, global_batch_size=self.cfg.global_batch_size, - rampup_batch_size=self.cfg.rampup_batch_size, + rampup_batch_size=self.cfg.get('rampup_batch_size', None), pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, ) elif self.cfg.data.dataloader_type == 'cyclic': diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index 683163246379..2aa2e8a3860e 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py 
@@ -123,6 +123,7 @@ def get_language_model( use_emha=False, ub_tp_comm_overlap=False, use_flash_attention=False, + seq_len_interpolation_factor=None, ): """Build language model and return along with the key to save.""" @@ -200,6 +201,7 @@ def get_language_model( use_emha=use_emha, ub_tp_comm_overlap=ub_tp_comm_overlap, use_flash_attention=use_flash_attention, + seq_len_interpolation_factor=seq_len_interpolation_factor, ) # key used for checkpoints. language_model_key = 'language_model' @@ -508,6 +510,7 @@ def __init__( use_emha=False, ub_tp_comm_overlap=False, use_flash_attention=False, + seq_len_interpolation_factor=None, ): super(TransformerLanguageModel, self).__init__(share_token_embeddings=share_embeddings_and_output_weights) @@ -559,7 +562,9 @@ def __init__( assert 0 < rotary_percentage <= 1 if rotary_percentage < 1: rotary_dim = int(rotary_dim * rotary_percentage) - self.rotary_pos_emb = RotaryEmbedding(rotary_dim) + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, seq_len_interpolation_factor=seq_len_interpolation_factor + ) elif position_embedding_type == 'alibi': # TODO: If this is used for encoder-decodemax_position_embeddingsr model, implement proper logic and following diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 22a223013fd2..0c8c811c2661 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -254,12 +254,12 @@ def float_conversion(val): class Float16Module(MegatronModule): - def __init__(self, module, precision): + def __init__(self, module, precision, share_token_embeddings=True): if not HAVE_MEGATRON_CORE: raise ImportError( "Megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
) - super().__init__() + super().__init__(share_token_embeddings=share_token_embeddings) self.precision = precision if precision == 'bf16': diff --git a/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py index 5a8d6d7dd333..c97010ecb911 100644 --- a/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/position_embedding/rotary_position_embedding.py @@ -21,13 +21,28 @@ class RotaryEmbedding(nn.Module): - def __init__(self, dim): + """ + Implements Rotary Position Embedding from https://arxiv.org/abs/2104.09864. + """ + + def __init__(self, dim: int, seq_len_interpolation_factor: int = None): + """ + Args: + + dim (int): rotary embedding dimension + seq_len_interpolation_factor (int): if not None, discrete positions will be interpolated + by this factor via the trick in https://arxiv.org/abs/2306.15595. 
+ """ super().__init__() + self.seq_len_interpolation_factor = seq_len_interpolation_factor inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) self.register_buffer('inv_freq', inv_freq) def forward(self, max_seq_len, offset=0): seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset + if self.seq_len_interpolation_factor is not None: + seq = seq.type_as(self.inv_freq) + seq *= 1 / self.seq_len_interpolation_factor freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) # first part even vector components, second part odd vector components, # 2 * dim in dimension size From e87985d0b77fcf3ab770df7aecf4e821fc2ba140 Mon Sep 17 00:00:00 2001 From: Sandeep Subramanian Date: Wed, 12 Jul 2023 01:24:20 -0700 Subject: [PATCH 097/123] add async + distopt to sft (#7018) Signed-off-by: MaximumEntropy --- .../language_modeling/megatron_gpt_sft_model.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 9507a01d01f0..946df3da2aa5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -296,6 +296,15 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape = [seq_length, get_micro_batch_size(), self.cfg.hidden_size] data_iter = get_iterator_k_split(batch, get_num_microbatches()) + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_o2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + fwd_bwd_function = get_forward_backward_func() losses_reduced_per_micro_batch = fwd_bwd_function( @@ -309,6 
+318,11 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=self.enable_autocast, + no_sync_func=no_sync_func, + grad_sync_func=grad_sync_func, + param_sync_func=param_sync_func, + overlap_p2p_comm=self.cfg.get('overlap_p2p_comm', False), + batch_p2p_comm=self.cfg.get('batch_p2p_comm', True), ) # only the last stages of the pipeline return losses From 77c666f6710f0e7ce6076f023f31122f0dfe0f43 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 12 Jul 2023 09:25:23 -0700 Subject: [PATCH 098/123] Adding tutorial for confidence ensembles (#6932) * Adding the confidence ensembles tutorial. Signed-off-by: Igor Gitman * Fix issues with notebook in colab Signed-off-by: Igor Gitman * Add clarification about the last cell Signed-off-by: Igor Gitman * Move SDP installation on top of the tutorial Signed-off-by: Igor Gitman --------- Signed-off-by: Igor Gitman --- PUBLICATIONS.md | 49 +- docs/source/asr/api.rst | 5 + docs/source/asr/configs.rst | 2 +- docs/source/asr/models.rst | 36 +- docs/source/starthere/tutorials.rst | 5 +- examples/asr/transcribe_speech.py | 1 - .../asr/models/confidence_ensemble.py | 10 +- .../confidence_ensembles/build_ensemble.py | 67 ++- .../asr/test_asr_interctc_models.py | 1 - .../asr/test_confidence_ensembles.py | 180 ++++++ tutorials/asr/Confidence_Ensembles.ipynb | 517 ++++++++++++++++++ 11 files changed, 832 insertions(+), 41 deletions(-) create mode 100644 tests/collections/asr/test_confidence_ensembles.py create mode 100644 tutorials/asr/Confidence_Ensembles.ipynb diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md index 365ed2773ed3..cd120efc7e7b 100644 --- a/PUBLICATIONS.md +++ b/PUBLICATIONS.md @@ -9,6 +9,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2023 + * [Confidence-based Ensembles of End-to-End Speech Recognition Models](https://arxiv.org/abs/2306.15824) * [Fast Entropy-Based Methods of Word-Level Confidence Estimation for End-to-End Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10022960) * [Damage Control During Domain Adaptation for Transducer Based Automatic Speech Recognition](https://ieeexplore.ieee.org/abstract/document/10023219) @@ -23,13 +24,13 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2021 - + * [Citrinet: Closing the Gap between Non-Autoregressive and Autoregressive End-to-End Models for Automatic Speech Recognition](https://arxiv.org/abs/2104.01721) * [SPGISpeech: 5,000 hours of transcribed financial audio for fully formatted end-to-end speech recognition](https://www.isca-speech.org/archive/interspeech_2021/oneill21_interspeech.html) * [CarneliNet: Neural Mixture Model for Automatic Speech Recognition](https://arxiv.org/abs/2107.10708) * [CTC Variations Through New WFST Topologies](https://arxiv.org/abs/2110.03098) * [A Toolbox for Construction and Analysis of Speech Datasets](https://openreview.net/pdf?id=oJ0oHQtAld) - +
@@ -45,11 +46,11 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2019 - + * [Jasper: An End-to-End Convolutional Neural Acoustic Model](https://arxiv.org/abs/1904.03288) * [QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/abs/1910.10261) - - + +
@@ -60,7 +61,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2022 - + * [TitaNet: Neural Model for Speaker Representation with 1D Depth-Wise Separable Convolutions and Global Context](https://ieeexplore.ieee.org/abstract/document/9746806)
@@ -68,8 +69,8 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2020 - - * [SpeakerNet: 1D Depth-wise Separable Convolutional Network for Text-Independent Speaker Recognition and Verification]( https://arxiv.org/pdf/2010.12653.pdf) + + * [SpeakerNet: 1D Depth-wise Separable Convolutional Network for Text-Independent Speaker Recognition and Verification]( https://arxiv.org/pdf/2010.12653.pdf)
@@ -79,7 +80,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2022 - + * [AmberNet: A Compact End-to-End Model for Spoken Language Identification](https://arxiv.org/abs/2210.15781) * [Accidental Learners: Spoken Language Identification in Multilingual Self-Supervised Models](https://arxiv.org/abs/2211.05103) @@ -88,17 +89,17 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2021 - + * [MarbleNet: Deep 1D Time-Channel Separable Convolutional Neural Network for Voice Activity Detection](https://ieeexplore.ieee.org/abstract/document/9414470/)
- +
2020 - + * [MatchboxNet - 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=337&id=993) - +
@@ -108,7 +109,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2022 - + * [NVIDIA NeMo Offline Speech Translation Systems for IWSLT 2022](https://aclanthology.org/2022.iwslt-1.18/)
@@ -130,7 +131,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2021 - + * [BioMegatron: Larger Biomedical Domain Language Model ](https://aclanthology.org/2020.emnlp-main.379/)
@@ -157,16 +158,16 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2021 - + * [SGD-QA: Fast Schema-Guided Dialogue State Tracking for Unseen Services](https://arxiv.org/abs/2105.08049) - +
2020 - + * [A Fast and Robust BERT-based Dialogue State Tracker for Schema-Guided Dialogue Dataset](https://arxiv.org/abs/2008.12335) - +
-------- @@ -175,19 +176,19 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I
2022 - + * [Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585)
2021 - + * [TalkNet: Fully-Convolutional Non-Autoregressive Speech Synthesis Model](https://www.isca-speech.org/archive/interspeech_2021/beliaev21_interspeech.html) * [TalkNet 2: Non-Autoregressive Depth-Wise Separable Convolutional Model for Speech Synthesis with Explicit Pitch and Duration Prediction](https://arxiv.org/abs/2104.08189) * [Hi-Fi Multi-Speaker English TTS Dataset](https://www.isca-speech.org/archive/pdfs/interspeech_2021/bakhturina21_interspeech.pdf) * [Mixer-TTS: non-autoregressive, fast and compact text-to-speech model conditioned on language model embeddings](https://arxiv.org/abs/2110.03584) - +
@@ -196,7 +197,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I # (Inverse) Text Normalization
2022 - + * [Shallow Fusion of Weighted Finite-State Transducer and Language Model for Text Normalization](https://arxiv.org/abs/2203.15917) * [Thutmose Tagger: Single-pass neural model for Inverse Text Normalization](https://arxiv.org/abs/2208.00064) @@ -207,7 +208,7 @@ Here, we list a collection of research articles that utilize the NeMo Toolkit. I * [NeMo Inverse Text Normalization: From Development to Production](https://www.isca-speech.org/archive/pdfs/interspeech_2021/zhang21ga_interspeech.pdf) * [A Unified Transformer-based Framework for Duplex Text Normalization](https://arxiv.org/pdf/2108.09889.pdf ) - +
-------- \ No newline at end of file diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst index 1e2073798d64..1d880018fd15 100644 --- a/docs/source/asr/api.rst +++ b/docs/source/asr/api.rst @@ -39,6 +39,11 @@ Model Classes :show-inheritance: :members: from_asr_config, from_pretrained_models, save_asr_model_to, setup_training_data +.. _confidence-ensembles-api: + +.. autoclass:: nemo.collections.asr.models.confidence_ensembles.ConfidenceEnsembleModel + :show-inheritance: + :members: transcribe Modules ------- diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index f9a4ea9970b1..d21b40e34570 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -64,7 +64,7 @@ An example ASR train and validation configuration should look similar to the fol There are two ways to test/validate on more than one manifest: - Specify a list in the `manifest_filepath` field. Results will be reported for each, the first one being used for overall loss / WER (specify `val_dl_idx` if you wish to change that). In this case, all manifests will share configuration parameters. -- Use the ds_item key and pass a list of config objects to it. This allows you to use differently configured datasets for validation, e.g. +- Use the ds_item key and pass a list of config objects to it. This allows you to use differently configured datasets for validation, e.g. .. code-block:: yaml diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 697a89827145..708d66307dd3 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -142,7 +142,7 @@ With local attention, inference is possible on audios >1 hrs (256 subsampling ch Fast Conformer models were trained using CosineAnnealing (instead of Noam) as the scheduler. 
-You may find the example CTC config at +You may find the example CTC config at ``/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml`` and the transducer config at ``/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml`` @@ -310,7 +310,7 @@ The main idea is to separate labels and blank score predictions, which allows to When external LM is available for inference, the internal LM can be subtracted from HAT model prediction in beamsearch decoding to improve external LM efficiency. It can be helpful in the case of text-only adaptation for new domains. -The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ +The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ class (instead of "RNNTJoint") for joint module. The all HAT logic is implemented in the "HATJiont" class. .. image:: images/hat.png @@ -353,6 +353,38 @@ For the detailed information see: * :ref:`Configs and training ` +.. _Confidence-Ensembles: + +Confidence-based Ensembles +-------------------------- + +Confidence-based ensemble is a simple way to combine multiple models into a single system by only retaining the +output of the most confident model. Below is a schematic illustration of how such ensembles work. + + .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.19.0/conf-ensembles-overview.png + :align: center + :alt: confidence-based ensembles + :scale: 50% + +For more details about this model, see the `paper `_ +or read our `tutorial `_. + +NeMo support Confidence-based Ensembles through the +:ref:`nemo.collections.asr.models.confidence_ensembles.ConfidenceEnsembleModel ` class. + +A typical workflow to create and use the ensemble is like this + +1. Run `scripts/confidence_ensembles/build_ensemble.py `_ + script to create ensemble from existing models. See the documentation inside the script for usage examples + and description of all the supported functionality. +2. 
The script outputs a checkpoint that combines all the models in an ensemble. It can be directly used to transcribe + speech by calling ``.transcribe()`` method or using + `examples/asr/transcribe_speech.py `_. + +Note that the ensemble cannot be modified after construction (e.g. it does not support finetuning) and only +transcribe functionality is supported (e.g., ``.forward()`` is not properly defined). + + References ---------- diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index 9c960053398b..e24637718690 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -106,6 +106,9 @@ To run a tutorial: * - ASR - Multi-lingual ASR - `Multi-lingual ASR `_ + * - ASR + - Confidence-based Ensembles + - `Confidence-based Ensembles `_ * - NLP - Using Pretrained Language Models for Downstream Tasks - `Pretrained Language Models for Downstream Tasks `_ @@ -146,7 +149,7 @@ To run a tutorial: - P-Tuning/Prompt-Tuning - `P-Tuning/Prompt-Tuning `_ * - NLP - - Synthetic Tabular Data Generation + - Synthetic Tabular Data Generation - `Synthetic Tabular Data Generation `_ * - TTS - NeMo TTS Primer diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 401755bc8275..f97dd96ad0f3 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -269,7 +269,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis cfg.rnnt_decoding.fused_batch_size = -1 cfg.rnnt_decoding.compute_timestamps = cfg.compute_timestamps cfg.rnnt_decoding.compute_langs = cfg.compute_langs - if 'preserve_alignments' in cfg.rnnt_decoding: cfg.rnnt_decoding.preserve_alignments = preserve_alignment diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index dd52d9a7010a..9b3191c8874d 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++
b/nemo/collections/asr/models/confidence_ensemble.py @@ -151,7 +151,11 @@ def compute_confidence(hypothesis: Hypothesis, confidence_cfg: ConfidenceConfig) class ConfidenceEnsembleModel(ModelPT): """Implementation of the confidence ensemble model. - See for details. + See https://arxiv.org/abs/2306.15824 for details. + + .. note:: + Currently this class only supports the `transcribe` method as it requires + full-utterance confidence scores to operate. """ def __init__( @@ -206,7 +210,7 @@ def __init__( for model_idx in range(self.num_models): model = getattr(self, f"model{model_idx}") # for now we assume users are direclty responsible for matching - # decoder type when building ensemlbe with inference type + # decoder type when building ensemble with inference type # TODO: add automatic checks for errors if isinstance(model, EncDecHybridRNNTCTCModel): self.update_decoding_parameters(model.cfg.decoding) @@ -218,7 +222,7 @@ def __init__( model.change_decoding_strategy(model.cfg.decoding) def update_decoding_parameters(self, decoding_cfg: DictConfig): - """Updating temperature/preserve_alignment/preserve_frame_confidence parameters of the config.""" + """Updating temperature/preserve_alignment parameters of the config.""" with open_dict(decoding_cfg): decoding_cfg.temperature = self.cfg.temperature decoding_cfg.preserve_alignments = True diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py index e953dec02b7a..b5685c63aa25 100644 --- a/scripts/confidence_ensembles/build_ensemble.py +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -11,9 +11,60 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -# Run ``python build_ensemble.py --help`` for usage examples. -# TODO: write usage.
Mention that neither train nor dev requires transcriptions + +""" +This script provides a functionality to create confidence-based ensembles +from a collection of pretrained models. + +For more details see the paper https://arxiv.org/abs/2306.15824 +or tutorial in tutorials/asr/Confidence_Ensembles.ipynb + +You would typically use this script by providing a yaml config file or overriding +default options from command line. + +Usage examples: + +1. Building an ensemble of two monolingual models with default settings (no confidence tuning). + + python build_ensemble.py --config-path=. --config-name=ensemble_config.yaml + ensemble.0.model=stt_it_conformer_ctc_large + ensemble.0.training_manifest= + ensemble.1.model=stt_es_conformer_ctc_large + ensemble.1.training_manifest= + output_path= + + You can have more than 2 models and can control transcription settings (e.g., batch size) + with ``transcription.`` parameters. + +2. If you want to get improved results, you can enable tuning of the confidence and logistic regression (LR) parameters. + E.g. + + python build_ensemble.py + + ensemble.0.dev_manifest= + ... + # IMPORTANT: see the note below if you use > 2 models! + ensemble.N.dev_manifest= + tune_confidence=True # to allow confidence tuning. LR is tuned by default + + As with any tuning, it is recommended to have reasonably large validation set for each model, + otherwise you might overfit to the validation data. + + Note that if you add additional models (> 2) you will need to modify ensemble_config.yaml + or create a new one with added models in there. While it's theoretically possible to + fully override such parameters from commandline, hydra is very unfriendly for such + use-cases, so it's strongly recommended to be creating new configs. + +3. 
If you want to precisely control tuning grid search, you can do that with + + python build_ensemble.py + + tune_confidence_config.confidence_type='[entropy_renui_exp,entropy_tsallis_exp]' # only tune over this set + tune_confidence_config.alpha='[0.1,0.5,1.0]' # only tune over this set + +You can check the dataclasses in this file for the full list of supported +arguments and their default values. +""" import atexit @@ -31,7 +82,7 @@ import joblib import numpy as np import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf +from omegaconf import MISSING, DictConfig, OmegaConf from sklearn.linear_model import LogisticRegression from sklearn.metrics import confusion_matrix from sklearn.pipeline import Pipeline, make_pipeline @@ -73,9 +124,9 @@ @dataclass class EnsembleConfig: # .nemo path or pretrained name - model: str + model: str = MISSING # path to the training data manifest (non-tarred) - training_manifest: str + training_manifest: str = MISSING # specify to limit the number of training samples # 100 is most likely enough, but setting higher default just in case max_training_samples: int = 1000 @@ -150,10 +201,10 @@ class TuneLogisticRegressionConfig: @dataclass class BuildEnsembleConfig: # where to save the resulting ensemble model - output_path: str + output_path: str = MISSING # each model specification - ensemble: List[EnsembleConfig] + ensemble: List[EnsembleConfig] = MISSING random_seed: int = 0 # for reproducibility diff --git a/tests/collections/asr/test_asr_interctc_models.py b/tests/collections/asr/test_asr_interctc_models.py index bad918fbc1f0..db9a4396d72d 100644 --- a/tests/collections/asr/test_asr_interctc_models.py +++ b/tests/collections/asr/test_asr_interctc_models.py @@ -86,7 +86,6 @@ class TestInterCTCLoss: ([], [0.3]), ], ) - @pytest.mark.pleasefixme def test_forward(self, model_class, encoder_config, apply_at_layers, loss_weights): preprocessor_config = {'_target_': 
'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'} vocabulary = [ diff --git a/tests/collections/asr/test_confidence_ensembles.py b/tests/collections/asr/test_confidence_ensembles.py new file mode 100644 index 000000000000..ad14a2a7e6ff --- /dev/null +++ b/tests/collections/asr/test_confidence_ensembles.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import joblib +import pytest +from omegaconf import DictConfig, ListConfig + +from nemo.collections.asr.metrics.wer import CTCDecodingConfig +from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel +from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMethodConfig + + +def get_model_config(model_class): + preprocessor_config = {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'} + vocabulary = [' ', "'", 'a', 'b', 'c'] # does not matter, so keeping small + encoder_config = { + '_target_': 'nemo.collections.asr.modules.ConformerEncoder', + 'feat_in': 64, + 'n_layers': 8, + 'd_model': 4, + } + if model_class is EncDecCTCModel: + decoder_config = { + '_target_': 'nemo.collections.asr.modules.ConvASRDecoder', + 'feat_in': None, + 'num_classes': len(vocabulary), + 'vocabulary': vocabulary, + } + model_config = DictConfig( + { + 
'compute_eval_loss': True, # will be ignored by the model + 'preprocessor': DictConfig(preprocessor_config), + 'encoder': DictConfig(encoder_config), + 'decoder': DictConfig(decoder_config), + } + ) + else: + decoder_config = { + '_target_': 'nemo.collections.asr.modules.RNNTDecoder', + 'prednet': {'pred_hidden': 4, 'pred_rnn_layers': 1}, + } + joint_config = { + '_target_': 'nemo.collections.asr.modules.RNNTJoint', + 'jointnet': {'joint_hidden': 4, 'activation': 'relu'}, + } + decoding_config = {'strategy': 'greedy_batch', 'greedy': {'max_symbols': 30}} + loss_config = {'loss_name': 'default', 'warprnnt_numba_kwargs': {'fastemit_lambda': 0.001}} + + model_config = DictConfig( + { + 'compute_eval_loss': True, + 'labels': ListConfig(vocabulary), + 'preprocessor': DictConfig(preprocessor_config), + 'model_defaults': DictConfig({'enc_hidden': 4, 'pred_hidden': 4}), + 'encoder': DictConfig(encoder_config), + 'decoder': DictConfig(decoder_config), + 'joint': DictConfig(joint_config), + 'decoding': DictConfig(decoding_config), + 'loss': DictConfig(loss_config), + 'optim': {'name': 'adamw'}, + 'aux_ctc': { + 'ctc_loss_weight': 0.3, + 'use_cer': False, + 'ctc_reduction': 'mean_batch', + 'decoder': { + '_target_': 'nemo.collections.asr.modules.ConvASRDecoder', + 'feat_in': None, + 'num_classes': len(vocabulary), + 'vocabulary': vocabulary, + }, + 'decoding': DictConfig(CTCDecodingConfig), + }, + } + ) + model_config['target'] = f'{model_class.__module__}.{model_class.__name__}' + + return model_config + + +class TestConfidenceEnsembles: + """Only basic tests that are very fast to run. 
+ + There are much more extensive integration tests available in + scripts/confidence_ensembles/test_confidence_ensembles.py + """ + + @pytest.mark.unit + @pytest.mark.parametrize( + "model_class0", [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel], + ) + @pytest.mark.parametrize( + "model_class1", [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel], + ) + def test_model_creation_2models(self, tmp_path, model_class0, model_class1): + """Basic test to check that ensemble of 2 models can be created.""" + model_config0 = get_model_config(model_class0) + model_config1 = get_model_config(model_class1) + + # dummy pickle file for the model selection block + joblib.dump({}, tmp_path / 'dummy.pkl') + + # default confidence + confidence_config = ConfidenceConfig( + # we keep frame confidences and apply aggregation manually to get full-utterance confidence + preserve_frame_confidence=True, + exclude_blank=True, + aggregation="mean", + method_cfg=ConfidenceMethodConfig( + name="entropy", + entropy_type="renui", + temperature=0.25, # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 + entropy_norm="lin", + ), + ) + + # just checking that no errors are raised when creating the model + ConfidenceEnsembleModel( + cfg=DictConfig( + { + 'model_selection_block': str(tmp_path / 'dummy.pkl'), + 'confidence': confidence_config, + 'temperature': 1.0, + 'num_models': 2, + 'model0': model_config0, + 'model1': model_config1, + } + ), + trainer=None, + ) + + def test_model_creation_5models(self, tmp_path): + """Basic test to check that ensemble of 5 models can be created.""" + model_configs = [get_model_config(EncDecCTCModel) for _ in range(5)] + + # dummy pickle file for the model selection block + joblib.dump({}, tmp_path / 'dummy.pkl') + + # default confidence + confidence_config = ConfidenceConfig( + # we keep frame confidences and apply aggregation manually to get full-utterance confidence + preserve_frame_confidence=True, + 
exclude_blank=True, + aggregation="mean", + method_cfg=ConfidenceMethodConfig( + name="entropy", + entropy_type="renui", + temperature=0.25, # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 + entropy_norm="lin", + ), + ) + + # just checking that no errors are raised when creating the model + ConfidenceEnsembleModel( + cfg=DictConfig( + { + 'model_selection_block': str(tmp_path / 'dummy.pkl'), + 'confidence': confidence_config, + 'temperature': 1.0, + 'num_models': 2, + 'model0': model_configs[0], + 'model1': model_configs[1], + 'model2': model_configs[2], + 'model3': model_configs[3], + 'model4': model_configs[4], + } + ), + trainer=None, + ) diff --git a/tutorials/asr/Confidence_Ensembles.ipynb b/tutorials/asr/Confidence_Ensembles.ipynb new file mode 100644 index 000000000000..f9617c75e36a --- /dev/null +++ b/tutorials/asr/Confidence_Ensembles.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "5. 
Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\"\"\"\n", + "import os\n", + "\n", + "# Install dependencies\n", + "!apt-get install sox libsndfile1 ffmpeg\n", + "\n", + "# setting up a workspace folder where all downloaded content will be held\n", + "# change it to whatever location is convenient and remove after you're done with this tutorial\n", + "WORKSPACE_DIR = os.path.abspath('confidence-ensembles-tutorial')\n", + "os.makedirs(WORKSPACE_DIR, exist_ok=True)\n", + "\n", + "# need to locate NeMo repository\n", + "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", + "\n", + "# option #1: local path to NeMo repo with NeMo already installed\n", + "NEMO_DIR = os.path.dirname(os.path.dirname(os.path.abspath('')))\n", + "\n", + "# option #2: download NeMo repo\n", + "if 'google.colab' in str(get_ipython()) or not os.path.exists(os.path.join(NEMO_DIR, \"nemo\")):\n", + " BRANCH = \"main\"\n", + " !git clone -b $BRANCH https://github.com/NVIDIA/NeMo $WORKSPACE_DIR/NeMo\n", + " NEMO_DIR = os.path.join(WORKSPACE_DIR, 'NeMo')\n", + "\n", + "# installing nemo (from source code)\n", + "!cd $NEMO_DIR && ./reinstall.sh\n", + "\n", + "# clone SDP and install requirements\n", + "!git clone https://github.com/NVIDIA/NeMo-speech-data-processor $WORKSPACE_DIR/NeMo-speech-data-processor\n", + "!pip install -r $WORKSPACE_DIR/NeMo-speech-data-processor/requirements.txt\n", + "\n", + "\"\"\"\n", + "Remember to restart the runtime for the kernel to pick up any upgraded packages.\n", + "Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\n", + "that you want to use the \"Run All Cells\" (or similar) option.\n", + "\"\"\"\n", + "# exit()" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# Confidence-based Ensembles of End-to-End ASR Models\n", + "\n", + "In this tutorial we discuss how to use confidence-based ensembles to improve different aspects of ASR models.\n", + "\n", + "We are only going to cover basics in this tutorial, so make sure to check out our [paper](https://arxiv.org/abs/2306.15824) to learn more details!\n", + "\n", + "Before we are going to learn **what** a confidence-based ensemble is, let's discuss **why** you might want to use one. A high-level motivation behind this method is that there are many \"expert\" ASR models that are publicly available. These models are often specialized to a certain language, accent or domain and might not perform well outside of it. But what if you need to cover multiple such target domains and you don't have a single model that works well on all of them? This is exactly the case when you should try confidence-based ensembles! In our paper we show two applications of this general idea:\n", + "\n", + "1. If you need to support multi-lingual ASR, but don't have a single model that covers all your languages, you basically have two choices. You can either run a separate [language-identification](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/langid_ambernet) (LID) block first to pick an ASR model from the corresponding language. Or you can run all models in parallel and use confidence to select which output to use. In the paper we show that the second method generally works better and can be even combined with LID model for the best results.\n", + "2. If you have a generic ASR model as well as a [finetuned version](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb) that works much better on a target domain. In such a case, your finetuned model will likely degrade on the \"base\" domain. What if you need to support both cases in a single application and don't have an easy way to know which domain the input comes from? 
To solve this, you can use confidence ensembles to pick the right output automatically.\n", + "\n", + "Let's also briefly talk about some limitations of the confidence-based ensembles.\n", + "\n", + "1. Confidence-based ensembles are not well suited for latency-critical applications as they require a few seconds of audio to select the most confident model.\n", + "2. The runtime cost grows linearly with each added model, which limits the practically useful ensemble size.\n", + "3. Given enough compute and data, it is likely possible to build specialized models that would outperform confidence-based ensembles on most tasks.\n", + "\n", + "To sum up — if you're combining a small number of models (e.g., up to 5), can afford a few seconds of additional latency and don't have resources to build a specialized model, confidence-based ensembles might be a good fit and you should try them out! There are many ASR models that you can combine in the ensemble available in [NVIDIA NGC cloud](https://catalog.ngc.nvidia.com/models) as well as other model hubs, such as [Hugging Face](https://huggingface.co/nvidia).\n", + "\n", + "In the next few cells we will cover what a confidence-based ensemble is and some best practices of using these models. Each cell is mostly self-contained, so feel free to skip around or jump directly to the code part if you want to see usage examples right away." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What is a confidence-based ensemble?\n", + "\n", + "You're probably familiar with more traditional [ensembles of machine learning models](https://en.wikipedia.org/wiki/Ensemble_learning). Confidence ensembles are a less popular approach where we only use an output of a single model that is deemed best for the current input. A typical way to pick the \"best\" output is to select a model with the highest confidence score, which provides an estimate of how likely the output is to be correct. 
Here is a schematic illustration of the model.\n", + "\n", + "\"Confidence-ensemble\n", + "\n", + "As you can see, to define a confidence ensemble, we need to define 3 things:\n", + "\n", + "1. Which models are part of the ensemble.\n", + "2. How do we estimate model's confidence.\n", + "3. How do we \"calibrate\" confidence values via a model selection block.\n", + "\n", + "Let's discuss each of these 3 items below.\n", + "\n", + "### Which models to use?\n", + "\n", + "A short answer — you can use any ASR models. E.g., you can combine a number of CTC models, or Transducer models, or even mix-and-match. \n", + "\n", + "A more detailed answer is that the performance of the confidence ensemble is upper-bounded by the performance of the best model on each of the input examples. Thus you will benefit if some of your models work really well on part of the input compared to other models. This way you will get more gains compared to each separate model, and it will also make correct model identification easier.\n", + "\n", + "### How to estimate a model's confidence?\n", + "\n", + "Good news, we have a whole separate [tutorial](TBD) on this topic! You can go through it if you want to know all the details about different ways to estimate confidence of NeMo ASR models. There are different confidence measures and aggregation functions and for the absolute best performance, you will need to run a grid-search to pick the best confidence estimation way for your specific models and data.\n", + "\n", + "That being said, we found that there exists a set of confidence parameters that work pretty well on a large set of models and datasets. They are default in NeMo and so you might not need to worry about running the search.
If you do want to maximize the performance by tuning the confidence parameters, you only need to add [a few extra config lines](#Building-and-evaluating-ensemble-(tuned-parameters)).\n", + "\n", + "### How to calibrate confidence values?\n", + "\n", + "Let's now talk about the \"model selection block\". First of all — you don't need to know the details to use confidence ensembles, calibration is always automatically performed when you build the model. But if you want to learn more, read on!\n", + "\n", + "First, let's discuss why we need a separate \"model selection block\" to pick the most confident model. If we had access to a perfect confidence, which would be exactly equal to the probability of the model's output being correct, we wouldn't need this block. In this idealized case we can simply take the model with the maximum confidence score. But in practice, models tend to be over- or under-confident, which means that their confidence scores need to be calibrated together to be comparable. E.g., one model might mostly produce scores from 0 to 0.8, while another model tends to produce scores from 0 to 0.5, even though they have the same average accuracy. So we want to multiply the first model's score by 1.25 and the second model's score by 2.0 to put them on the same \"scale\".\n", + "\n", + "More generally, the goal of the model selection block is to pick the right model for each input. So it needs to solve a standard classification task, where the set of all models' confidence scores is the input and the \"most confident\" model index is the output. Since this is a standard classification problem in a low-dimensional space, we found that using a logistic regression (LR) model is sufficient to solve it with a high accuracy. We assume that for each model there exists a small (e.g., 100-1000 examples) set of input utterances that the model performs the best on.
E.g., if you build a multi-lingual ensemble, this set will come from the language the model is trained to recognize. We will use these samples → model correspondence as the ground-truth for training LR.\n", + "\n", + "> **_note:_** If you don't have a clear \"audio → best recognition model\" correspondence, you can still build it artificially, as long as you also have ground-truth text labels. Just take a larger set of inputs, run all models on them and compute WER. This will tell you which model works best for which audio. But note that if all your models perform very similarly, the gains from confidence ensembling will also be minimal!\n", + "\n", + "Even though logistic regression is a simple model and operates in a low-dimensional space, we found that it's still beneficial sometimes to tune its hyperparameters, especially if your input data is imbalanced (e.g., you have more ground-truth samples for some models than others). This tuning is very cheap and so will be performed automatically, as long as you [specify a validation set in the config](#Building-and-evaluating-ensemble-(tuned-parameters))." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to use confidence-based ensembles in NeMo?\n", + "\n", + "The following cells contain code examples of how to use confidence ensembles in NeMo. We will build confidence ensemble of two models - generic ASR model trained on a large set of audio and a modified version of the same model that's finetuned to recognize [Irish English accent](https://openslr.org/83/).\n", + "\n", + "To do this, we will go through the following steps:\n", + "\n", + "1. Download and process the Irish accent data using NVIDIA's [Speech Data Processor](https://github.com/NVIDIA/NeMo-speech-data-processor).\n", + "2. Finetune the [Conformer Large CTC LS model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large_ls) on this data. 
All steps work exactly the same for Transducer models as well.\n", + "3. Evaluate performance of the original and finetuned models on the Irish accent data and on LibriSpeech.\n", + "4. Build a confidence-based ensemble (with default parameters) of these two models and check how it compares with each of the models.\n", + "5. Tune the confidence hyperparameters of the ensemble and check how the performance changes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading and preparing Irish accent data using [Speech Data Processor](https://github.com/NVIDIA/NeMo-speech-data-processor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# let's start by downloading and processing the Irish accent data with SDP\n", + "# Check out https://github.com/NVIDIA/NeMo-speech-data-processor to learn more details\n", + "\n", + "# run the Irish accent preparation config (will download and process data for us)\n", + "cmd = (\n", + " f\"cd {WORKSPACE_DIR}/NeMo-speech-data-processor && \"\n", + " \"python main.py --config-path=dataset_configs/english/slr83 --config-name=config.yaml \"\n", + " f\"workspace_dir={WORKSPACE_DIR}/slr83-data dialect=irish_english_male data_split={{data_split}}\"\n", + ")\n", + "for data_split in ['train', 'dev', 'test']:\n", + " print(f\"****************** Preparing Irish accent data (split={data_split}) ******************\\n\\n\")\n", + " cur_cmd = cmd.format(data_split=data_split)\n", + " !$cur_cmd\n", + " \n", + "# you can inspect https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/slr83/config.yaml\n", + "# to see what processing was done. 
\n", + "# You can also check the generated NeMo manifests inside 'slr83-data' folder \n", + "# that are ready for training and evaluation \n", + "\n", + "!ls $WORKSPACE_DIR/slr83-data/irish_english_male" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetuning the generic model on the accent data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# before running training, let's open up a tensorboard pane to see the progress\n", + "# you might need to install tensorboard and tensorboard jupyter extension if you get errors\n", + "# you can totally skip this cell, since the logs will also be streamed to stdout\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir $WORKSPACE_DIR/irish_finetuning --bind_all " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# now let's finetune the generic model on this data. 
\n", + "# We will only run finetuning for 5 epochs (the results can be improved by running longer)\n", + "# check out https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb\n", + "# to learn more about finetuning NeMo ASR models\n", + "from omegaconf import open_dict, OmegaConf\n", + "from pytorch_lightning import Trainer\n", + "\n", + "from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE\n", + "import nemo.utils.exp_manager as exp_manager\n", + "\n", + "\n", + "# feel free to play around with parameters here (e.g., increase bs/devices to match your GPUs)\n", + "# but note that you might need to tune LR a bit to get good results\n", + "\n", + "\n", + "trainer = Trainer(\n", + " devices=1, # to have the same results on single/multi-gpu systems\n", + " max_epochs=5, # we typically want to finetune for 50-100 epochs, but 5 is enough for the tutorial\n", + " # just some reasonable defaults\n", + " accelerator='auto',\n", + " accumulate_grad_batches=1,\n", + " enable_checkpointing=False,\n", + " logger=False,\n", + " log_every_n_steps=100,\n", + ") \n", + "model = EncDecCTCModelBPE.from_pretrained(\"stt_en_conformer_ctc_large_ls\", trainer=trainer)\n", + "\n", + "# updating data/optimization to support finetuning\n", + "with open_dict(model.cfg):\n", + " # setting up data manifests and lowering batch size in case we deal with low-memory GPUs\n", + " model.cfg.train_ds.manifest_filepath = f\"{WORKSPACE_DIR}/slr83-data/irish_english_male/train_manifest.json\"\n", + " model.cfg.train_ds.batch_size = 4\n", + " model.cfg.train_ds.is_tarred = False\n", + " model.cfg.validation_ds.manifest_filepath = f\"{WORKSPACE_DIR}/slr83-data/irish_english_male/dev_manifest.json\"\n", + " model.cfg.validation_ds.batch_size = 4\n", + "\n", + " model.cfg.optim.lr = 0.02 # 100 times lower to facilitate finetuning\n", + " model.cfg.optim.sched.warmup_steps = 0 # no warmup\n", + "\n", + "# updating the model according to the new 
parameters\n", + "model.setup_training_data(model.cfg.train_ds)\n", + "model.setup_multiple_validation_data(model.cfg.validation_ds)\n", + "model.setup_optimization(model.cfg.optim)\n", + "\n", + "# controlling where the model is saved and asking to save best WER model\n", + "exp_manager_config = exp_manager.ExpManagerConfig(\n", + " exp_dir=f'{WORKSPACE_DIR}/irish_finetuning',\n", + " checkpoint_callback_params=exp_manager.CallbackParams(\n", + " monitor=\"val_wer\",\n", + " mode=\"min\",\n", + " always_save_nemo=True,\n", + " save_best_model=True,\n", + " ),\n", + ")\n", + "exp_manager.exp_manager(trainer, OmegaConf.structured(exp_manager_config))\n", + " \n", + "# launching finetuning\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating both models to compare performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's evaluate the performance of the original and finetuned models on the test set of the Irish accent data\n", + "# as well as the LibriSpeech (which is a proxy for generic ASR domain). We expect the finetuned model to be\n", + "# significantly better on the Irish data and significantly worse on the LS\n", + "\n", + "# running the script to download LibriSpeech data\n", + "os.makedirs(os.path.join(WORKSPACE_DIR, \"librispeech\"), exist_ok=True)\n", + "!cd $NEMO_DIR && python scripts/dataset_processing/get_librispeech_data.py \\\n", + " --data_root=$WORKSPACE_DIR/librispeech --data_set=test_other,dev_other\n", + "\n", + "\n", + "# running evaluation with generic model on LS. 
Typically will be run as a script in command line, but we want to\n", + "# capture WER numbers for display later, so let's import and run the evaluation function here\n", + "\n", + "# adding script folder to python path to be able to import it\n", + "import glob\n", + "import sys\n", + "import pandas as pd\n", + "\n", + "sys.path.insert(0, os.path.join(NEMO_DIR, \"examples\", \"asr\"))\n", + "from speech_to_text_eval import EvaluationConfig, main as run_eval\n", + "\n", + "wer_results = {\n", + " 'generic': [], # LS, Irish\n", + " 'finetuned': [],\n", + "}\n", + "\n", + "# running evaluation with generic model\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"librispeech\", \"test_other.json\"),\n", + " pretrained_name=\"stt_en_conformer_ctc_large_ls\",\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['generic'].append(eval_cfg.metric_value)\n", + "\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"slr83-data\", \"irish_english_male\", \"test_manifest.json\"),\n", + " pretrained_name=\"stt_en_conformer_ctc_large_ls\",\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['generic'].append(eval_cfg.metric_value)\n", + "\n", + "\n", + "# running evaluation with finetuned model\n", + "finetuned_model_path = glob.glob(os.path.join(WORKSPACE_DIR, \"irish_finetuning\", \"**\", \"*.nemo\"), recursive=True)[0]\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"librispeech\", \"test_other.json\"),\n", + " model_path=finetuned_model_path,\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['finetuned'].append(eval_cfg.metric_value)\n", + "\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " 
dataset_manifest=os.path.join(WORKSPACE_DIR, \"slr83-data\", \"irish_english_male\", \"test_manifest.json\"),\n", + " model_path=finetuned_model_path,\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['finetuned'].append(eval_cfg.metric_value)\n", + "\n", + "# you should be able to see that the generic model is much better\n", + "# on LibriSpeech and much worse on the accent data\n", + "print(\"\\n*************************** Results ***************************\\n\")\n", + "pd.DataFrame(wer_results, index=['LibriSpeech', 'Irish Accent']).transpose()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building and evaluating ensemble (default parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# now let's finally combine the two models in the confidence-based ensemble!\n", + "# first, we are going to use default parameters (no tuning)\n", + "cmd = (\n", + " f\"cd {NEMO_DIR} && python scripts/confidence_ensembles/build_ensemble.py \"\n", + " # and example config is good enough for our purposes\n", + " f\"--config-path={NEMO_DIR}/scripts/confidence_ensembles --config-name=ensemble_config.yaml \"\n", + " # specifying model and corresponding dataset (to be used as ground-truth for logistic regression training)\n", + " \"ensemble.0.model=stt_en_conformer_ctc_large_ls \"\n", + " # by default it subsamples to a max of 1000 samples, so it's not going to use the full data\n", + " # note that for librispeech we are using the dev data - this is just to avoid downloading the training set\n", + " # it's perfectly fine and simpler to use the training data here\n", + " f\"ensemble.0.training_manifest={WORKSPACE_DIR}/librispeech/dev_other.json \"\n", + " # same for the second model/dataset\n", + " f\"ensemble.1.model={finetuned_model_path} \"\n", + " 
f\"ensemble.1.training_manifest={WORKSPACE_DIR}/slr83-data/irish_english_male/train_manifest.json \"\n", + " # setting up the final checkpoint location and lower batch size to save GPU memory\n", + " f\"output_path={WORKSPACE_DIR}/confidence_ensemble_default.nemo \"\n", + " \"transcription.batch_size=4 \"\n", + ")\n", + "\n", + "# building the ensemble\n", + "!$cmd\n", + "\n", + "# running evaluation on LibriSpeech and Irish accent data\n", + "# you will see that the transcription is run 2 times, since we need to run both models to get confidence scores\n", + "wer_results['ensemble (default)'] = []\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"librispeech\", \"test_other.json\"),\n", + " model_path=os.path.join(WORKSPACE_DIR, 'confidence_ensemble_default.nemo'),\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['ensemble (default)'].append(eval_cfg.metric_value)\n", + "\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"slr83-data\", \"irish_english_male\", \"test_manifest.json\"),\n", + " model_path=os.path.join(WORKSPACE_DIR, 'confidence_ensemble_default.nemo'),\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['ensemble (default)'].append(eval_cfg.metric_value)\n", + "\n", + "# you should be able to see that the ensemble with default parameters is already \n", + "# working very well. 
It might even be slightly better than the best model,\n", + "# because it can sometimes \"incorrectly\" pick generic model on Irish data\n", + "# when it's actually giving lower WER than the finetuned model (and same for LibriSpeech).\n", + "print(\"\\n*************************** Results ***************************\\n\")\n", + "pd.DataFrame(wer_results, index=['LibriSpeech', 'Irish Accent']).transpose()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building and evaluating ensemble (tuned parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# now, we are going to allow tuning of the confidence and LR parameters to see how this affects results\n", + "# this cell is quite similar to the previous one - the only difference is in parameters of the\n", + "# command-line to build an ensemble.\n", + "\n", + "# for LibriSpeech, since we already used validation for training the logistic regression \n", + "# (to avoid downloading actual training data), we will create a new manifest with \n", + "# just 100 samples for training and another 100 for validation\n", + "!head -n 100 {WORKSPACE_DIR}/librispeech/dev_other.json > {WORKSPACE_DIR}/librispeech/dev_other_train100.json\n", + "!tail -n 100 {WORKSPACE_DIR}/librispeech/dev_other.json > {WORKSPACE_DIR}/librispeech/dev_other_dev100.json\n", + "\n", + "# we keep everything exactly the same, but specify a few additional config settings\n", + "cmd = (\n", + " f\"cd {NEMO_DIR} && python scripts/confidence_ensembles/build_ensemble.py \"\n", + " f\"--config-path={NEMO_DIR}/scripts/confidence_ensembles --config-name=ensemble_config.yaml \"\n", + " \"ensemble.0.model=stt_en_conformer_ctc_large_ls \"\n", + " f\"ensemble.0.training_manifest={WORKSPACE_DIR}/librispeech/dev_other_train100.json \"\n", + " f\"ensemble.1.model={finetuned_model_path} \"\n", + " 
f\"ensemble.1.training_manifest={WORKSPACE_DIR}/slr83-data/irish_english_male/train_manifest.json \"\n", + " # let's specify to just use 100 samples here as well to make tuning faster\n", + " # 100 is usually more than enough (remember that we are just fitting 2 parameters in the logistic regression)\n", + " # but default is 1000 just in case\n", + " f\"ensemble.1.max_training_samples=100 \"\n", + " # the tuning will take a bit more memory, so let's use bs=2 this time\n", + " \"transcription.batch_size=2 \"\n", + " # requesting to tune the confidence\n", + " # you can also specify exactly what grid-search to run here,\n", + " # but we'd just use the default (it's reasonably large)\n", + " \"tune_confidence=True \"\n", + " # need to provide the validation sets for the tuning\n", + " f\"ensemble.0.dev_manifest={WORKSPACE_DIR}/librispeech/dev_other_dev100.json \"\n", + " f\"ensemble.1.dev_manifest={WORKSPACE_DIR}/slr83-data/irish_english_male/dev_manifest.json \"\n", + " f\"output_path={WORKSPACE_DIR}/confidence_ensemble_tuned.nemo \"\n", + ")\n", + "\n", + "# building the ensemble. 
You should see that the confidence computation step is \n", + "# taking quite a bit longer - this is where the grid search happens\n", + "!$cmd\n", + "\n", + "# running evaluation on LibriSpeech and Irish accent data\n", + "# you will see that the transcription is run 2 times, since we need to run both models to get confidence scores\n", + "wer_results['ensemble (tuned)'] = []\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"librispeech\", \"test_other.json\"),\n", + " model_path=os.path.join(WORKSPACE_DIR, 'confidence_ensemble_tuned.nemo'),\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['ensemble (tuned)'].append(eval_cfg.metric_value)\n", + "\n", + "eval_cfg = run_eval(EvaluationConfig(\n", + " dataset_manifest=os.path.join(WORKSPACE_DIR, \"slr83-data\", \"irish_english_male\", \"test_manifest.json\"),\n", + " model_path=os.path.join(WORKSPACE_DIR, 'confidence_ensemble_tuned.nemo'),\n", + " batch_size=4,\n", + " output_filename=os.path.join(WORKSPACE_DIR, \"eval_results.json\"),\n", + "))\n", + "wer_results['ensemble (tuned)'].append(eval_cfg.metric_value)\n", + "\n", + "# the tuned ensemble should be a bit better than default (but not too much)\n", + "# note that there is a bit of randomness in the finetuning and our dev set is quite small\n", + "# so it's possible that the tuned model can be similar to default or even slightly worse\n", + "# for real applications it's recommended to use a larger dev set,\n", + "# but tuning will take longer in this case\n", + "print(\"\\n*************************** Results ***************************\\n\")\n", + "pd.DataFrame(wer_results, index=['LibriSpeech', 'Irish Accent']).transpose()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 
3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 40c1cccc9eb6773243a5637ddaf0e6ff023d87e9 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Wed, 12 Jul 2023 10:00:05 -0700 Subject: [PATCH 099/123] Add support for Numba FP16 RNNT Loss (#6991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Force working space memory to always be in fp32 Signed-off-by: smajumdar * Add support for fp16 testing in Numba Signed-off-by: smajumdar * Add support for fp16 testing in Numba Signed-off-by: smajumdar * Add support for fp16 testing in Numba Signed-off-by: smajumdar * Fix cost calculation by upcasting to fp32 Signed-off-by: smajumdar * Fix cost calculation by upcasting to fp32 Signed-off-by: smajumdar * Add support to check if numba fp16 is available Signed-off-by: smajumdar * add RNN-T loss implemented by PyTorch and test code (#5312) * Fix the bugs in cache-aware streaming Conformer (#5032) Signed-off-by: Vahid Signed-off-by: Hainan Xu * IA3 support for GPT and T5 (#4909) * init commit for ia3 adater training in GPT Signed-off-by: arendu * ia3 adater training in GPT, models and adapter classes Signed-off-by: arendu * reshape to operate even on non-contiguous tensors Signed-off-by: arendu * configs Signed-off-by: arendu * fixed none init Signed-off-by: arendu * adding adapter and ia3 support for T5 based models Signed-off-by: arendu * style fix Signed-off-by: arendu * config update and t5 model adapter and ia3 Signed-off-by: arendu * removed unused imports Signed-off-by: arendu * predict step for inference Signed-off-by: arendu * style fix Signed-off-by: arendu * style fix Signed-off-by: arendu * adapter inference for t5 Signed-off-by: arendu * style fix Signed-off-by: arendu * fixed bug micro and global batch size in eval Signed-off-by: arendu * minor 
edit Signed-off-by: arendu * agressive truncation if in test examples if no truncation field is given Signed-off-by: arendu * corrected for language_model_path name changes in main Signed-off-by: arendu * removed unused import Signed-off-by: arendu * name change for language_model_path Signed-off-by: arendu * include inter_attention to IA3 Signed-off-by: arendu * minor fix in confg Signed-off-by: arendu * minor fixes Signed-off-by: arendu * removed unused flag Signed-off-by: arendu * addressing PR comments Signed-off-by: arendu * address PR comments Signed-off-by: arendu * minor fix Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style fix Signed-off-by: arendu * CI test Signed-off-by: arendu * minor fix in jenkinsfile Signed-off-by: arendu Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * Bug fix - Limit val batches set to 1.0 (#5023) * Bug fix Signed-off-by: shanmugamr1992 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Adressed sandeep's comments * Fixing limit val batches support in bert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixing limit val batches support in bert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: shanmugamr1992 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Signed-off-by: Hainan Xu * [bug_fix] kv_channels is used when available (#5066) * fix bug s.t kv_channels is used when available Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * P&C Docs (#5068) (#5069) Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Co-authored-by: Matvei Novikov Signed-off-by: Hainan Xu * Add spe_split_by_unicode_script arg (#5072) * Add spe_split_by_unicode_script arg Signed-off-by: Anas * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Anas Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * probabilites -> probabilities (#5078) (#5079) Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: nithinraok Co-authored-by: Nithin Rao Signed-off-by: Hainan Xu * increase PR and Issue sweep quantity and active close PRs. (#5073) * increase PR and Issue sweep quantity and active close PRs. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * update with stricter rules, 30 days to be stale and 7 days to be closed for both Issues and PRs. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Hainan Xu * [TTS] added missing German phoneme tokenizer. 
(#5070) (#5074) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Hainan Xu * rename to match prompt leanring (#5076) Signed-off-by: arendu Signed-off-by: arendu Signed-off-by: Hainan Xu * Missing fixes from r1.11.0 to T5 finetuning eval (#5054) (#5061) * Fixes to seq2seq eval Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: MaximumEntropy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * Notebook bug fixes (#5084) (#5085) * Notebook bug fixes Signed-off-by: Virginia Adams * Turned nemo install back on Signed-off-by: Virginia Adams * reverted notebook Signed-off-by: Virginia Adams * Updated one line in entity linking nb Signed-off-by: Virginia Adams Signed-off-by: Virginia Adams Co-authored-by: Eric Harper Signed-off-by: Virginia Adams Co-authored-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: Hainan Xu * update strategy in notebook from ddp_fork to dp (#5088) (#5089) Co-authored-by: Zhilin Wang Signed-off-by: Hainan Xu * Fix bug in Squeezeformer Conv block (#5011) (#5024) * Fix bug in Squeezeformer Conv block Signed-off-by: smajumdar * Fix kernel context Signed-off-by: smajumdar * Fix access mixin Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: Hainan Xu * fixed megatron lm conversion bug (PTL related) (#5038) (#5063) Signed-off-by: David Mosallanezhad Signed-off-by: David Mosallanezhad Co-authored-by: David Mosallanezhad Signed-off-by: David Mosallanezhad Co-authored-by: David Co-authored-by: David Mosallanezhad Co-authored-by: Eric 
Harper Signed-off-by: Hainan Xu * Fix Unhashable type list for Numba Cuda spec augment kernel (#5093) (#5094) Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: Hainan Xu * Fix numba (#5098) Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: Hainan Xu * Make it possible to specify output_filename in normalize_with_audio.py (#5092) Signed-off-by: Elena Rastorgueva Signed-off-by: Elena Rastorgueva Signed-off-by: Hainan Xu * Greedy decoding confidence for CTC and RNNT (#4931) * rnnt confidence draft Signed-off-by: Aleksandr Laptev * word confidence Signed-off-by: Aleksandr Laptev * advanced entropies added Signed-off-by: Aleksandr Laptev * refactoring Signed-off-by: Aleksandr Laptev * oops forgot a file Signed-off-by: Aleksandr Laptev * metrics and benchmarking script added Signed-off-by: Aleksandr Laptev * style fix Signed-off-by: Aleksandr Laptev * texterrors installation added Signed-off-by: Aleksandr Laptev * lgtm and bug fix Signed-off-by: Aleksandr Laptev * fix comments Signed-off-by: Aleksandr Laptev * fix typos Signed-off-by: Aleksandr Laptev * add missing import after rebase Signed-off-by: Aleksandr Laptev Signed-off-by: Aleksandr Laptev Co-authored-by: Aleksandr Laptev Signed-off-by: Hainan Xu * [Add] SLURP models and examples (#4668) * add model, util and loss Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * refactor annd update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update and refactor Signed-off-by: stevehuang52 * update and refactor Signed-off-by: stevehuang52 * update and refactor 
Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * update available models Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * refactor data processing Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * refactor and update Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * move transformer to asr.modules Signed-off-by: stevehuang52 * move transformer to asr.modules Signed-off-by: stevehuang52 * get rid of jsonlines Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * revert changes to nlp Signed-off-by: stevehuang52 Signed-off-by: stevehuang52 Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Signed-off-by: Hainan Xu * only optimize params that are part of the adapter modules (#5086) Signed-off-by: arendu Signed-off-by: arendu Co-authored-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Signed-off-by: Hainan Xu * Pipeline Parallel T5 Prompt Learning (#4956) * Added pre process flag checks and pipeline parallel in fwd Signed-off-by: Virginia Adams * Added rank check for pipeline parallel Signed-off-by: Virginia Adams * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * T5 prompt learning works! 
Signed-off-by: Virginia Adams * IA3 passing CI Signed-off-by: Virginia Adams * Fixed typo Signed-off-by: Virginia Adams * removed optimizer setup so Adi's change will not conflict Signed-off-by: Virginia Adams Signed-off-by: Virginia Adams Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * [TTS] remove phonemizer.py (#5090) remove phonemizer.py and convert code block to markdown in the tutorial. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Hainan Xu * T5 Decoding with PP > 2 fix (#5091) (#5103) * set sequence lenghts in the pipeline properly Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy Signed-off-by: MaximumEntropy Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian Signed-off-by: Hainan Xu * [TTS] fixed wrong val loss for epoch 0 and inconsistent metrics names (#5087) (#5102) * fixed hifigan configs as well * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * Fix and refactor consumed samples save/restore for Megatron models. 
(#5077) * Fixes and refactor Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Remove unused imports Signed-off-by: MaximumEntropy * Empty Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy Signed-off-by: MaximumEntropy Signed-off-by: Hainan Xu * RIR corpus generator tool (#4927) Signed-off-by: Ante Jukić Signed-off-by: Ante Jukić Signed-off-by: Hainan Xu * Multiprocessing fix (#5106) (#5107) Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Signed-off-by: Matvei Novikov Co-authored-by: Matvei Novikov Signed-off-by: Hainan Xu * [Bug fix] PC lexical + audio (#5109) (#5110) * training running Signed-off-by: ekmb * revert Signed-off-by: ekmb * revert Signed-off-by: ekmb Signed-off-by: ekmb Signed-off-by: ekmb Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Signed-off-by: Hainan Xu * [Fix] schedulers with no max_steps param (#4564) * fix schedulers Signed-off-by: stevehuang52 * update to use python inspect module Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 Signed-off-by: stevehuang52 Signed-off-by: Hainan Xu * T5 prompt learning fixes missing from r.11.0 merge (#5075) (#5101) * Fix special tokens Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * Empty Signed-off-by: MaximumEntropy Signed-off-by: MaximumEntropy Co-authored-by: David Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian Co-authored-by: David Co-authored-by: Eric Harper Signed-off-by: Hainan Xu * [TTS] Add NeMo TTS Primer Tutorial (#4933) * [TTS] Add NeMo TTS Primer Tutorial Signed-off-by: Ryan Signed-off-by: Hainan Xu * Add Squeezeformer CTC model checkpoints on Librispeech (#5121) Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: Hainan Xu * adding loss normalization options to rnnt joint (#4829) * adding normalization options to rnnt joint loss * moving the param to joint * moving loss normalization to rnnt loss config * style * cleaning up * fixing sum reduction in joint 
Signed-off-by: Dima Rekesh * moving reduction into RNNT loss class * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactoring * typos Signed-off-by: Dima Rekesh Signed-off-by: Dima Rekesh Co-authored-by: Dima Rekesh Co-authored-by: Oleksii Kuchaiev Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * Asr concat dataloader (#5108) * forced precision * typo * initial commit Signed-off-by: Dima Rekesh * typos and bugs Signed-off-by: Dima Rekesh * reverting conformer encoder Signed-off-by: Dima Rekesh * additional checks Signed-off-by: Dima Rekesh * adding support to CTC models as well * reverting conformer_encoder Signed-off-by: Dima Rekesh * typo Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactoring Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactoring Signed-off-by: Dima Rekesh * merging Signed-off-by: Dima Rekesh Signed-off-by: Dima Rekesh Signed-off-by: Dima Rekesh Co-authored-by: Dima Rekesh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Somshubra Majumdar Signed-off-by: Hainan Xu * fix blossom ci unittests Signed-off-by: Oleksii Kuchaiev Signed-off-by: Hainan Xu * bugfix: pybtex.database.InvalidNameString: Too many commas in author field. 
(#5112) (#5115) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Hainan Xu * Uppdate container version to 22.09 (#5105) * update container version Signed-off-by: ericharper * pin click Signed-off-by: ericharper * pin click 8.0.2 Signed-off-by: ericharper Signed-off-by: ericharper Signed-off-by: Hainan Xu * Remove unsupported arguments from MegatronNMT (#5065) * Fixes Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * Style Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * More fixes Signed-off-by: MaximumEntropy Signed-off-by: MaximumEntropy Signed-off-by: Hainan Xu * pp2 support for T5 IA3 learning and T5 Adapters learning (#5116) * enabling pp2 Signed-off-by: arendu * optimizer update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * T5 pp>1 support for adapters and ia3 Signed-off-by: arendu * fix bug with missing adapter_tuning Signed-off-by: arendu * inference error fixed, pp=2 Signed-off-by: arendu Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Oleksii Kuchaiev Signed-off-by: Hainan Xu * T5 Prompt Learning Fixes for Pipeline Parallel (#5120) * Initial fixes Signed-off-by: MaximumEntropy * Added back validation acc Signed-off-by: Virginia Adams * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Put num workers back Signed-off-by: Virginia Adams * added relative encoding if statament Signed-off-by: Virginia Adams * Added back val loss only validation Signed-off-by: Virginia Adams * Revert "Added back val loss only validation" This reverts commit 86d8f4806fe30335c40c3716ce18259939df500f. 
* Removed val acc for PP > 1 Signed-off-by: Virginia Adams * Removed enc_seq_len if statement Signed-off-by: Virginia Adams * Added back validation acc calc Signed-off-by: Virginia Adams * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: MaximumEntropy Signed-off-by: Virginia Adams Signed-off-by: Virginia Adams Co-authored-by: Virginia Adams Co-authored-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Virginia Adams Signed-off-by: Hainan Xu * add doc info (#4721) Signed-off-by: Yang Zhang Signed-off-by: Yang Zhang Signed-off-by: Hainan Xu * [TTS] Add SpanishCharsTokenizer (#5135) * [TTS] Add SpanishCharsTokenizer Signed-off-by: Ryan Signed-off-by: Hainan Xu * Update megatron interface to dialogue (#4936) * fix style formatting Signed-off-by: Zhilin Wang * update template to include description of intent Signed-off-by: Zhilin Wang * update Jenkinsfile Signed-off-by: Zhilin Wang * changes based on requests in review Signed-off-by: Zhilin Wang * add compatibility with assistant dataset Signed-off-by: Zhilin Wang * update Jenkins Signed-off-by: Zhilin Wang * remove dialogue_state_tracking Signed-off-by: Zhilin Wang * update huggingface utils for dialogue Signed-off-by: Zhilin Wang * rename dialogue_state_tracking_hybrid to dialogue_state_tracking_sgdqa Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * fix style Signed-off-by: Zhilin Wang * style fix nemo/collections/nlp/models/dialogue_state_tracking_sgdqa/__init__.py Signed-off-by: Zhilin Wang * update Jenkinsfile for SGDGEN Signed-off-by: Zhilin Wang * update Jenkinsfile for SGDGEN Signed-off-by: Zhilin Wang * update Jenkinsfile for SGDGEN Signed-off-by: Zhilin Wang * update Jenkinsfile for SGDGEN Signed-off-by: Zhilin Wang * update Jenkinsfile for SGDGEN Signed-off-by: Zhilin Wang * fix typo Signed-off-by: Zhilin 
Wang * add docstrings for assistant data processsor Signed-off-by: Zhilin Wang * update Jenkins for SGDGEN local checkpoint Signed-off-by: Zhilin Wang * update style Signed-off-by: Zhilin Wang * use local vocab file for Jenkinsfile Signed-off-by: Zhilin Wang * patch for Jenkins CI using local file Signed-off-by: Zhilin Wang * add slot filling prediction and metrics Signed-off-by: Zhilin Wang * remove unused code Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * refactor metrics code out of Dialogue GPT Model Signed-off-by: Zhilin Wang * integrate backward compatible support for IntentSlotClassificationModel (bert model) Signed-off-by: Zhilin Wang * save prediction file for IntentSlotClassification Signed-off-by: Zhilin Wang * update dialogue gpt model training for megatron gpt Signed-off-by: Zhilin Wang * remove batch generate for HF GPT2, which causes lower performance Signed-off-by: Zhilin Wang * add few shot capability to dialogue gpt model Signed-off-by: Zhilin Wang * update Jenkinsfile and remove unused import Signed-off-by: Zhilin Wang * update code description and clarity Signed-off-by: Zhilin Wang * address PR comments Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * integrate compatibility with ZeroShotIntentModel Signed-off-by: Zhilin Wang * rename folder to dialogue due to increased scope and further refactor for clarity Signed-off-by: Zhilin Wang * added dialogue GPT for sequence generation task (e.g. answer extender) Signed-off-by: Zhilin Wang * add CI test for DialogueGPTGenerationModel Signed-off-by: Zhilin Wang * integrate DialogueS2SGenerationModel for generation task (e.g. 
answer extender) Signed-off-by: Zhilin Wang * modify huggingface utils to support HF t5/BART models Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * remove unused imports Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update Jenkinsfile Signed-off-by: Zhilin Wang * update Jenkinsfile Signed-off-by: Zhilin Wang * update bleu metric Signed-off-by: Zhilin Wang * fix bleu metric style Signed-off-by: Zhilin Wang * debug bleu metric Signed-off-by: Zhilin Wang * debug bleu metric Signed-off-by: Zhilin Wang * update based on PR #3893 Signed-off-by: Zhilin Wang * update 2 based on PR #3893 Signed-off-by: Zhilin Wang * update 3 based on PR #3893 Signed-off-by: Zhilin Wang * integrate sgd generation based on user user utterance and system slot-values to generate system utterance Signed-off-by: Zhilin Wang * add validation model saving capabilities Signed-off-by: Zhilin Wang * cleaned up code for SGD Based Answer extender Signed-off-by: Zhilin Wang * update Dialogue Generation CI Signed-off-by: Zhilin Wang * update Jenkinsfile Signed-off-by: Zhilin Wang * update Jenkinsfile Signed-off-by: Zhilin Wang * fix Jenkins CI issue" Signed-off-by: Zhilin Wang * add support for design dataset Signed-off-by: Zhilin Wang * remove unnecessary imports Signed-off-by: Zhilin Wang * update Jenkins Signed-off-by: Zhilin Wang * update jenkins Signed-off-by: Zhilin Wang * update jenkins Signed-off-by: Zhilin Wang * support megatron for dialogue_s2s_generation_model Signed-off-by: Zhilin Wang * reduce loaded samples in MSMarcoDataProcessor to 64 when cfg.model.dataset.debug_mode=True Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update CI Signed-off-by: Zhilin Wang * update checkpoint and predictions filename to include epoch number Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * integrate HF BART MNLI into zero shot intent model 
Signed-off-by: Zhilin Wang * integrate Dialogue Nearest Neighbour Model Signed-off-by: Zhilin Wang * update Jenkins Signed-off-by: Zhilin Wang * update Jenkins Signed-off-by: Zhilin Wang * refactor Dialogue SGD Data Processor to make interface for models cleaner Signed-off-by: Zhilin Wang * update jenkins Signed-off-by: Zhilin Wang * update Dialogue S2S Generation model for DialogueSGDDataProcessor interface Signed-off-by: Zhilin Wang * update jenkins Signed-off-by: Zhilin Wang * update jenkins Signed-off-by: Zhilin Wang * support sgd and drive thru datasets by zero shot model and nearest neighbour model Signed-off-by: Zhilin Wang * add prediction saving code to nearest neighbour and zero shot intent models Signed-off-by: Zhilin Wang * fix typo in sgd data processor Signed-off-by: Zhilin Wang * integrate Dialogue Mellon QA Data Processor Signed-off-by: Zhilin Wang * update mellon qa Signed-off-by: Zhilin Wang * update dialogue.py to remove outdated info Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update dialogue_config.yaml Signed-off-by: Zhilin Wang * update dialogue_config.yaml Signed-off-by: Zhilin Wang * add dialogue docs Signed-off-by: Zhilin Wang * address review comments Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix for cfg Signed-off-by: Zhilin Wang * make dependency on apex optional Signed-off-by: Zhilin Wang * change NLPDDPluggin calling logic to make it possible to run without apex Signed-off-by: Zhilin Wang * add first draft of tutorial Signed-off-by: Zhilin Wang * reduce ms marco size by removing lines without wellFormedAnswers Signed-off-by: Zhilin Wang * address pr comments Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update colab tutorial link in dialogue docs Signed-off-by: Zhilin Wang * include unit test and some 
refactor to facilitate unit test Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * address pr issues Signed-off-by: Zhilin Wang * remove typos in dialogue tutorial Signed-off-by: Zhilin Wang * support larger files for question answering Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * remove unnecessary artifacts to reduce memory use Signed-off-by: Zhilin Wang * put 0 tensor to device Signed-off-by: Zhilin Wang * update link within dialogue tutorial Signed-off-by: Zhilin Wang * restore previously delete files Signed-off-by: Zhilin Wang * update error handling when loss = nan Signed-off-by: Zhilin Wang * update nan handling Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update spanning loss func Signed-off-by: Zhilin Wang * update spanning loss Signed-off-by: Zhilin Wang * fix type error raised in qa_dataset.py Signed-off-by: Zhilin Wang * add error checking message Signed-off-by: Zhilin Wang * revert back to float32 Signed-off-by: Zhilin Wang * revert back to float32 Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update exp logging Signed-off-by: Zhilin Wang * update error msgs Signed-off-by: Zhilin Wang * update loading of large file from pickle to json Signed-off-by: Zhilin Wang * update loading of large file from pickle to json Signed-off-by: Zhilin Wang * limit number of negative samples Signed-off-by: Zhilin Wang * revert post processing Signed-off-by: Zhilin Wang * revert post processing Signed-off-by: Zhilin Wang * remove unused methods and style fix Signed-off-by: Zhilin 
Wang * add more documentation Signed-off-by: Zhilin Wang * remove unused imports Signed-off-by: Zhilin Wang * changes base on PR review Signed-off-by: Zhilin Wang * set wandb logger falseby default Signed-off-by: Zhilin Wang * update interface with megatron gpt prompt learning Signed-off-by: Zhilin Wang * update inline documentation Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update prompt_ids Signed-off-by: Zhilin Wang * update error msg Signed-off-by: Zhilin Wang * update config Signed-off-by: Zhilin Wang * update config Signed-off-by: Zhilin Wang * set inference = False for dialgue prompt learning during trainng Signed-off-by: Zhilin Wang * set inference = False for dialgue prompt learning during trainng Signed-off-by: Zhilin Wang * remove unused code Signed-off-by: Zhilin Wang * update config yaml Signed-off-by: Zhilin Wang * fix bug for megatron gpt prompt learning Signed-off-by: Zhilin Wang * remove unused import Signed-off-by: Zhilin Wang * address comments in PR Signed-off-by: Zhilin Wang * address comments in PR Signed-off-by: Zhilin Wang * address typo Signed-off-by: Zhilin Wang * add megatron t5 inference Signed-off-by: Zhilin Wang * fix bug due to bert tokenizer not being space-aware Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update style Signed-off-by: Zhilin Wang * update IntentSlotModel onnx export test Signed-off-by: Zhilin Wang * update style Signed-off-by: Zhilin Wang * update exportable Signed-off-by: Zhilin Wang * address PR comments Signed-off-by: Zhilin Wang * replace functools.cache_property with functools.lru_cache to maintain python 3.7 compatibility Signed-off-by: Zhilin Wang * improve speed of rank_candidates and support for p tuning Signed-off-by: Zhilin Wang * update dialogue.py Signed-off-by: Zhilin Wang * fix megatron prompt learning saving bug Signed-off-by: Zhilin Wang * update generate_candidate method 
Signed-off-by: Zhilin Wang * remove repeated init text ids and invert attention masks Signed-off-by: Zhilin Wang * update typo Signed-off-by: Zhilin Wang * custom collate fn to remove excess padding in batch Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * style fix Signed-off-by: Zhilin Wang * update complete method to mitigate issue when max seq len is low Signed-off-by: Zhilin Wang * address pr comments Signed-off-by: Zhilin Wang * update generation interface Signed-off-by: Zhilin Wang Signed-off-by: Zhilin Wang Co-authored-by: Zhilin Wang Co-authored-by: Oleksii Kuchaiev Co-authored-by: Yang Zhang Co-authored-by: Eric Harper Co-authored-by: Sandeep Subramanian Signed-off-by: Hainan Xu * Added save inference ready .nemo file with every checkpoint (#5055) * Added save inference ready .nemo file with every checkpoint Signed-off-by: Virginia Adams * Python style fix Signed-off-by: Virginia Adams * addressed Adi's comment Signed-off-by: Virginia Adams * Added ptuning check in model checkpoint saving Signed-off-by: Virginia Adams * Changed save_nemo_on_valdaition default to False Signed-off-by: Virginia Adams * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Changes global batch size of adapter CI Signed-off-by: Virginia Adams * Changed num workers to 0 Signed-off-by: Virginia Adams * added first stage of pipeline check Signed-off-by: Virginia Adams * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Virginia Adams Signed-off-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * Fixes for docs/typos + remove max_utts parameter from tarred datasets as it causes hang in training (#5118) * Remove ; from jupyter notebook cells Signed-off-by: Igor Gitman * Fix typos in documentation/code Signed-off-by: Igor 
Gitman * Fix output message to have 'or equal' Signed-off-by: Igor Gitman * Link formatting fixes Signed-off-by: Igor Gitman * Add error if max_utts is used in tarred datasets Signed-off-by: Igor Gitman * Remove max_utts parameter from tarred datasets Signed-off-by: Igor Gitman * Fix max_utts removal in tests Signed-off-by: Igor Gitman * Fix typo if -> is Signed-off-by: Igor Gitman Signed-off-by: Igor Gitman Signed-off-by: Hainan Xu * Merge r1.12.0 main (#5139) * update branch Signed-off-by: ericharper * Add cherry-pick action (#4958) * add cherry-pick action Signed-off-by: ericharper * Pin Transformers version to fix CI (#4955) * Pin transformers version in CI to prevent offline tokenizer loading error Signed-off-by: SeanNaren * Drop version Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Enable offline Signed-off-by: SeanNaren Signed-off-by: SeanNaren Signed-off-by: ericharper Signed-off-by: SeanNaren Co-authored-by: Sean Naren * upper bound transformers Signed-off-by: ericharper * remove duplicate transformers requirement Signed-off-by: ericharper * Release SOTA Lang ID model (#5080) * add pretrained lang id model ambernet Signed-off-by: fayejf * update doc and style fix Signed-off-by: fayejf Signed-off-by: fayejf * update branch and package info Signed-off-by: ericharper * remove upper bounds on lightning and transformers Signed-off-by: ericharper * remove transformers offline from ci Signed-off-by: ericharper * upper bound transformers Signed-off-by: ericharper Signed-off-by: ericharper Signed-off-by: SeanNaren Signed-off-by: fayejf Co-authored-by: Sean Naren Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Signed-off-by: Hainan Xu * Added ASR model comparison to SDE (#5043) SDE: Added ASR model comparison tool to SDE transcribe speech: Added support for many predictions in one file, as well as custom field names Signed-off-by: George Zelenfroynd 
Signed-off-by: Hainan Xu * fix nmt eval sampler (#5154) Signed-off-by: Abhinav Khattar Signed-off-by: Abhinav Khattar Signed-off-by: Hainan Xu * Fix Global init steps (#5143) * move global step to base Signed-off-by: Yi Dong * fix fused softmax Signed-off-by: Yi Dong * add the missing file Signed-off-by: Yi Dong * update the fused kernel Signed-off-by: Yi Dong * fix import error Signed-off-by: Yi Dong * fix import again Signed-off-by: Yi Dong Signed-off-by: Yi Dong Signed-off-by: Yi Dong Co-authored-by: Yi Dong Co-authored-by: Sandeep Subramanian Signed-off-by: Hainan Xu * [TTS] bug fix - sample rate was being ignored in vocoder dataset (#4518) * bug fix - sample rate was being ignored in vocoder dataset when not loading mel * handled n segments for a different sampling rate than original sampling rate * Added case for n_segments 0, warning for n_segments greater than file length Signed-off-by: Paarth Neekhara Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Jocelyn Signed-off-by: Hainan Xu * Add EMA support to NeMo (#4764) * Added Base files Signed-off-by: SeanNaren * Some refactors, swap to using MNIST Lnet Signed-off-by: SeanNaren * Add a few more tests, allow the callback to be set via the exp manager Signed-off-by: SeanNaren * Actually run validation for testing Signed-off-by: SeanNaren * Run isort Signed-off-by: SeanNaren * Add test for saving state/fix saving state Signed-off-by: SeanNaren * Use dummy model Signed-off-by: SeanNaren * Fix test Signed-off-by: SeanNaren * Add copyright Signed-off-by: SeanNaren * Support saving separate EMA weight module Signed-off-by: SeanNaren * Add standalone functionality/logging Signed-off-by: SeanNaren * Expose more parameters Signed-off-by: SeanNaren * Modify to allow option to replace validation Signed-off-by: SeanNaren * Add jenkins test, formatting Signed-off-by: SeanNaren * Pin Transformers version to fix CI (#4955) * Pin transformers version in CI to prevent offline 
tokenizer loading error Signed-off-by: SeanNaren * Drop version Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Enable offline Signed-off-by: SeanNaren Signed-off-by: SeanNaren * Add cherry-pick action (#4958) (#4961) * add cherry-pick action Signed-off-by: ericharper * Pin Transformers version to fix CI (#4955) * Pin transformers version in CI to prevent offline tokenizer loading error Signed-off-by: SeanNaren * Drop version Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Disable offline temporarily Signed-off-by: SeanNaren * Enable offline Signed-off-by: SeanNaren Signed-off-by: SeanNaren Signed-off-by: ericharper Signed-off-by: SeanNaren Co-authored-by: Sean Naren Signed-off-by: ericharper Signed-off-by: SeanNaren Co-authored-by: Eric Harper Co-authored-by: Sean Naren Signed-off-by: SeanNaren * Fix changelog builder (#4962) (#4963) Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: SeanNaren * fix cherry pick workflow (#4964) (#4965) Signed-off-by: ericharper Signed-off-by: ericharper Signed-off-by: ericharper Co-authored-by: Eric Harper Signed-off-by: SeanNaren * reorder model check (#4959) (#4967) Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: nithinraok Co-authored-by: Nithin Rao Signed-off-by: SeanNaren * check for active conda environment (#4970) (#4971) Signed-off-by: SeanNaren * [TTS] fix broken tutorial for MixerTTS. (#4949) (#4976) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: SeanNaren * Checkpoint averaging class fix (#4946) * 1. Added args.class_path to provide it externally. Signed-off-by: Micha Livne * 1. 
Fixed style. Signed-off-by: Micha Livne Signed-off-by: Micha Livne Signed-off-by: SeanNaren * Add ability to give seperate datasets for test, train and validation (#4798) * Add ability to give seperate datasets for test, train and validation * Addressed Sandeeps comments * Addressed Sandeeps comments * Add ability to give seperate datasets for test, train and validation * Add ability to give seperate datasets for test, train and validation * Addressed review comments * Bug fix for common dataset utils * Add CI tests Signed-off-by: shanmugamr1992 * Reformat code Signed-off-by: shanmugamr1992 * Bug fix Signed-off-by: shanmugamr1992 * Bug fix * Bug Fix * Bug Fix * Update Jenkinsfile * Addressed comments * Addressed Eriks comments. * Addressed Sandeep * Update Jenkinsfile * Update Jenkinsfile * Update dataset_utils.py * Update Jenkinsfile * Update Jenkinsfile * Use GPT CI config Signed-off-by: MaximumEntropy Signed-off-by: shanmugamr1992 Signed-off-by: MaximumEntropy Co-authored-by: MaximumEntropy Signed-off-by: SeanNaren * fix label models restoring issue from wrighted cross entropy (#4968) (#4975) Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: nithinraok Co-authored-by: Nithin Rao Signed-off-by: SeanNaren * Add simple pre-commit file (#4983) * Add simple pre-commit file Signed-off-by: SeanNaren * Exclude docs folder Signed-off-by: SeanNaren * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: SeanNaren * Revert "[pre-commit.ci] auto fixes from pre-commit.com hooks" This reverts commit 053bd5ba579537a5f311b431871c21f3381b43eb. 
Signed-off-by: SeanNaren * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: SeanNaren Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: SeanNaren * Import pycuda.autoprimaryctx or pycuda.autoinit to init pycuda execution environment (#4951) Signed-off-by: Jin Li Signed-off-by: Jin Li Co-authored-by: Somshubra Majumdar Signed-off-by: SeanNaren * Adding speaker embedding conditioning in fastpitch (#4986) Signed-off-by: subhankar-ghosh Signed-off-by: subhankar-ghosh Signed-off-by: SeanNaren * Fix ASR issues (#4984) (#4991) * Fix ASR issues Signed-off-by: smajumdar * Revert fix Signed-off-by: smajumdar Signed-off-by: smajumdar Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Signed-off-by: SeanNaren * Fix current tests Signed-off-by: SeanNaren * More test coverage Signed-off-by: SeanNaren * Address reviews Signed-off-by: SeanNaren * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address review Signed-off-by: SeanNaren * Drop bf16 test Signed-off-by: SeanNaren * Address review Signed-off-by: SeanNaren * remove print Signed-off-by: SeanNaren * Add bf16 Signed-off-by: SeanNaren Signed-off-by: SeanNaren Signed-off-by: ericharper Signed-off-by: smajumdar Signed-off-by: nithinraok Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Micha Livne Signed-off-by: shanmugamr1992 Signed-off-by: MaximumEntropy Signed-off-by: Jin Li Signed-off-by: subhankar-ghosh Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Micha Livne Co-authored-by: shanmugamr1992 <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: MaximumEntropy 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: liji-nv <59594262+liji-nv@users.noreply.github.com> Co-authored-by: Subhankar Ghosh Signed-off-by: Hainan Xu * Fix BF16 test (#5162) Signed-off-by: SeanNaren Signed-off-by: SeanNaren Signed-off-by: Hainan Xu * Fix errors in speaker diarization nemo docs (#5153) * fix docs and docstrings for MSDD Signed-off-by: Taejin Park * fix nemo docs errors Signed-off-by: Taejin Park * reflected review comments Signed-off-by: Taejin Park Signed-off-by: Taejin Park Signed-off-by: Hainan Xu * Add interleaved pipeline schedule to GPT (#5025) * add virtual pipeline size to config Signed-off-by: ericharper * convert model to list of modules Signed-off-by: ericharper * convert model to list of modules Signed-off-by: ericharper * convert model to list of modules Signed-off-by: ericharper * update for list of modules Signed-off-by: ericharper * add virtual to init Signed-off-by: ericharper * update first last stage embedding all reduce Signed-off-by: ericharper * update sequence parallel all reduce for virtual models Signed-off-by: ericharper * runs but we get an error Signed-off-by: ericharper * set virtual rank 0 after looping Signed-off-by: ericharper * account for virtual when determinining first and last pipeline stages Signed-off-by: ericharper * checkpointing for virtual models in progress Signed-off-by: ericharper * add checkpoint hooks Signed-off-by: ericharper * working on validation when resuming Signed-off-by: ericharper * skip sanity val steps by default in config Signed-off-by: ericharper * remove comment Signed-off-by: ericharper * log number of params Signed-off-by: ericharper * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style Signed-off-by: ericharper * check if self.model is a list Signed-off-by: ericharper * make virtual pipeline default size None on init Signed-off-by: ericharper * make virtual pipeline 
default to None in config Signed-off-by: ericharper * remove ensure_divisibility call Signed-off-by: ericharper * fix lgtm alerts Signed-off-by: ericharper * remove num_sanity_val_steps from config Signed-off-by: ericharper * default virtual pipeline size to none Signed-off-by: ericharper * check for list Signed-off-by: ericharper * update assert to make sure we are only doing virtual for gpt Signed-off-by: ericharper * revert change to get_params_for_weight_decay Signed-off-by: ericharper * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * init var Signed-off-by: ericharper * add import guard for set virtual model parallel world size Signed-off-by: ericharper * use import guard Signed-off-by: ericharper * update calls to fake init in eval scripts Signed-off-by: ericharper * add _get_fwd_bwd_function Signed-off-by: ericharper * log all total model parameters Signed-off-by: ericharper * remove unused import Signed-off-by: ericharper Signed-off-by: ericharper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Hainan Xu * reduced to 14 inactive days to be stale for PRs. (#5165) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Hainan Xu * refactor TTS documentation organization and add new contents. (#5137) * refactor TTS documentation organization and add new contents. * fix asr api bug. * fix broken links. * fix unexpected indentation errors. * fixed unexpected indentation. * fixed broken paper reference. * fixed cross-reference and typos. * fixed toctree errors. * revert to 'Augmentors' * reordered TTS tutorial list in starthere. * ordered api classes alphabetically for each Section. * fixed underscore typo for fastpitch checkpoint. 
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * upcase 'Tuning' Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fixed typo for RAD-TTS Aligner Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * reorder aligner section after mel-gen and vocoders in models.rst. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * clarify Mixer-TTS-X and reorder model descriptions alphabetically. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fixed some typos and formats. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * removed old megatron.rst. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fixed block quote ends without a blank line warnings. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * remove duplicate reference; fixed missing key nlp-megatron-shoeybi2019megatron Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Revert "removed old megatron.rst." This reverts commit c5ea1dc3f23272eecfe8040e3abfa54fa122cf73. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * removed Russian, a hyphen, and add a note about G2P in tts/config.rst Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * added pynini installation in wfst_text_normalization.rst Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * added description of manifest key/value pairs. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add toctree in tts/intro Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * replace main branch to stable. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add 'upcoming' for e2e systems. 
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * replaced main branch to stabl… * Multiblank Transducer (#5527) * multi-blank transducers Signed-off-by: Hainan Xu * one line bug fix Signed-off-by: Hainan Xu * change interface of RNNTDecoding class to extract num-extra-output from joint instead of constructor Signed-off-by: Hainan Xu * addressed PR comments Signed-off-by: Hainan Xu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Hainan Xu Co-authored-by: Hainan Xu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Default RNNT loss to int64 targets (#6011) Signed-off-by: smajumdar * Rebase Signed-off-by: smajumdar * Begin refactoring tests Signed-off-by: smajumdar * Pass all tests for RNNT numba loss Signed-off-by: smajumdar * Pass all tests for RNNT numba loss Signed-off-by: smajumdar * Remove print Signed-off-by: smajumdar * Fix test for version Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert bad merges Signed-off-by: smajumdar * Revert bad merges Signed-off-by: smajumdar * Address comments Signed-off-by: smajumdar * Remove wrong file Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Signed-off-by: Vahid Signed-off-by: Hainan Xu Signed-off-by: arendu Signed-off-by: shanmugamr1992 Signed-off-by: Matvei Novikov Signed-off-by: Anas Signed-off-by: nithinraok Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: MaximumEntropy Signed-off-by: Virginia Adams Signed-off-by: smajumdar Signed-off-by: David Mosallanezhad Signed-off-by: Elena Rastorgueva Signed-off-by: Aleksandr Laptev Signed-off-by: stevehuang52 Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Signed-off-by: Ante Jukić 
Signed-off-by: ekmb Signed-off-by: Ryan Signed-off-by: Dima Rekesh Signed-off-by: Dima Rekesh Signed-off-by: Oleksii Kuchaiev Signed-off-by: ericharper Signed-off-by: Virginia Adams Signed-off-by: Yang Zhang Signed-off-by: Zhilin Wang Signed-off-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Signed-off-by: Igor Gitman Signed-off-by: SeanNaren Signed-off-by: fayejf Signed-off-by: George Zelenfroynd Signed-off-by: Abhinav Khattar Signed-off-by: Yi Dong Signed-off-by: Yi Dong Signed-off-by: Paarth Neekhara Signed-off-by: Micha Livne Signed-off-by: Jin Li Signed-off-by: subhankar-ghosh Signed-off-by: Taejin Park Signed-off-by: Miguel Martínez Signed-off-by: miguelangel Signed-off-by: Jocelyn Huang Signed-off-by: 彭震东 <275331498@qq.com> Signed-off-by: Alexandra Antonova Signed-off-by: Jason Signed-off-by: Patrick Simianer Signed-off-by: Shantanu Acharya Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Tim Moon Signed-off-by: eharper Signed-off-by: Micha Livne Signed-off-by: Oleksii Volkovskyi Signed-off-by: Yuekai Zhang Signed-off-by: Boris Fomitchev Signed-off-by: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Signed-off-by: whrichd Signed-off-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Signed-off-by: Vladimir Bataev Signed-off-by: Viraj Karandikar Signed-off-by: Yu Yao Signed-off-by: PeganovAnton Signed-off-by: Somshubra Majumdar Signed-off-by: Jonghwan Hyeon Signed-off-by: Boris Fomitchev Signed-off-by: shane carroll Co-authored-by: Samuel Kriman Co-authored-by: Hainan Xu Co-authored-by: Vahid Noroozi Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Sandeep Subramanian Co-authored-by: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Matvei Novikov Co-authored-by: Anas Abou Allaban Co-authored-by: Nithin Rao Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Virginia Adams <78445382+vadam5@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Zhilin Wang Co-authored-by: David Co-authored-by: David Mosallanezhad Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Aleksandr Laptev Co-authored-by: Aleksandr Laptev Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Co-authored-by: Ryan Langman Co-authored-by: Dima Rekesh Co-authored-by: Dima Rekesh Co-authored-by: Oleksii Kuchaiev Co-authored-by: Oleksii Kuchaiev Co-authored-by: Virginia Adams Co-authored-by: Virginia Adams Co-authored-by: Yang Zhang Co-authored-by: Zhilin Wang Co-authored-by: Igor Gitman Co-authored-by: Sean Naren Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Abhinav Khattar Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Co-authored-by: Yi Dong Co-authored-by: Paarth Neekhara Co-authored-by: Jocelyn Co-authored-by: Micha Livne Co-authored-by: liji-nv <59594262+liji-nv@users.noreply.github.com> Co-authored-by: Subhankar Ghosh Co-authored-by: Taejin Park Co-authored-by: Miguel Martínez <26169771+miguelusque@users.noreply.github.com> Co-authored-by: miguelangel Co-authored-by: 彭震东 <275331498@qq.com> Co-authored-by: Igor Gitman Co-authored-by: bene-ges <61418381+bene-ges@users.noreply.github.com> Co-authored-by: Alexandra Antonova Co-authored-by: Jason Co-authored-by: Rajesh Ilango Co-authored-by: 
pks Co-authored-by: Shantanu Acharya Co-authored-by: Shantanu Acharya Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Kirthi Shankar Sivamani Co-authored-by: ksivamani Co-authored-by: Eric Harper Co-authored-by: Sandeep Subramanian Co-authored-by: Micha Livne Co-authored-by: Oleksii Volkovskyi Co-authored-by: Yuekai Zhang Co-authored-by: Hainan Xu Co-authored-by: Boris Fomitchev Co-authored-by: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Co-authored-by: Anmol Gupta Co-authored-by: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Co-authored-by: Riqiang Wang <43883260+whrichd@users.noreply.github.com> Co-authored-by: Vladimir Bataev Co-authored-by: Shanmugam Ramasamy Co-authored-by: Viraj Karandikar <16838694+virajkarandikar@users.noreply.github.com> Co-authored-by: Shane Carroll <50530592+1-800-BAD-CODE@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Yi Dong Co-authored-by: PeganovAnton Co-authored-by: Jonghwan Hyeon Co-authored-by: Kaden Uhlig Co-authored-by: Kaden Uhlig Co-authored-by: Boris Fomitchev Co-authored-by: Jonghwan Hyeon --- nemo/collections/asr/losses/rnnt.py | 26 +++- nemo/collections/asr/losses/rnnt_pytorch.py | 5 + .../asr/parts/numba/rnnt_loss/rnnt.py | 2 +- .../asr/parts/numba/rnnt_loss/rnnt_numpy.py | 5 + .../asr/parts/numba/rnnt_loss/rnnt_pytorch.py | 7 +- .../rnnt_loss/utils/cpu_utils/cpu_rnnt.py | 8 +- .../numba/rnnt_loss/utils/rnnt_helper.py | 3 +- nemo/core/utils/numba_utils.py | 36 +++++ .../asr/numba/rnnt_loss/test_rnnt_pytorch.py | 126 ++++++++++++------ .../rnnt_loss/utils/test_gpu_rnnt_kernel.py | 64 +++++---- .../asr/numba/rnnt_loss/utils/test_reduce.py | 18 ++- .../numba/rnnt_loss/utils/test_rnnt_helper.py | 75 +++++++---- 12 files changed, 263 insertions(+), 112 deletions(-) diff --git a/nemo/collections/asr/losses/rnnt.py b/nemo/collections/asr/losses/rnnt.py index 
10b85acb42ef..a884f7d3cc68 100644 --- a/nemo/collections/asr/losses/rnnt.py +++ b/nemo/collections/asr/losses/rnnt.py @@ -38,9 +38,10 @@ from nemo.collections.asr.losses.rnnt_pytorch import MultiblankRNNTLossPytorch, RNNTLossPytorch, TDTLossPytorch from nemo.core.classes import Loss, typecheck from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType +from nemo.core.utils import numba_utils from nemo.core.utils.k2_utils import K2_INSTALLATION_MESSAGE from nemo.core.utils.numba_utils import NUMBA_INSTALLATION_MESSAGE -from nemo.utils import logging, model_utils +from nemo.utils import logging, logging_mode, model_utils try: import warprnnt_pytorch as warprnnt @@ -98,7 +99,7 @@ class RNNTLossConfig: min_version='0.53.0', is_available=NUMBA_RNNT_AVAILABLE, installation_msg=NUMBA_INSTALLATION_MESSAGE, - force_float32=True, + force_float32=not numba_utils.NUMBA_FP16_SUPPORTED, ), "pytorch": RNNTLossConfig( loss_name="pytorch", @@ -387,7 +388,7 @@ def __init__(self, num_classes, reduction: str = 'mean_batch', loss_name: str = for the standard "blank" symbol. In particular, say V is the number of non-blank tokens in the vocabulary, then in the case of, standard RNNT: num_classes = V - multiblank RNNT: num_classes = V + number-big-blanks (since we store big-blanks before + multiblank RNNT: num_classes = V + number-big-blanks (since we store big-blanks before standard blank, and the standard blank is the last symbol in the vocab) TDT: num_classes = V. Note, V here does not include any of the "duration outputs". 
@@ -413,6 +414,7 @@ def __init__(self, num_classes, reduction: str = 'mean_batch', loss_name: str = self.reduction = reduction self._loss = resolve_rnnt_loss(loss_name, blank_idx=self._blank, loss_kwargs=loss_kwargs) self._force_float32 = RNNT_LOSS_RESOLVER[loss_name].force_float32 + self._fp16_compat_checked = False def reduce(self, losses, target_lengths): @@ -442,8 +444,22 @@ def forward(self, log_probs, targets, input_lengths, target_lengths): max_targets_len = target_lengths.max() # Force cast joint to float32 - # TODO: Remove once Numba supports FP16 - if self._force_float32 and log_probs.dtype != torch.float32: + if not self._force_float32 and numba_utils.NUMBA_FP16_SUPPORTED: + # Execute the kernel in fp16 + pass + elif self._force_float32 and log_probs.dtype != torch.float32: + # Log just once if fp16 tensor was passed and fp16 Numba CUDA loss could not be used. + if log_probs.dtype == torch.float16 and not self._fp16_compat_checked: + _, reason = numba_utils.is_numba_cuda_fp16_supported(return_reason=True) + logging.warning( + f"Provided RNNT Joint tensor is of dtype {log_probs.dtype}, but RNNT loss could not be calculated " + f"in fp16 due to following reason stated below. Loss will be calculated in fp32. 
\n\n" + f"{reason}", + mode=logging_mode.ONCE, + ) + self._fp16_compat_checked = True + + # Upcast the activation tensor and compute loss and grads in fp32 logits_orig = log_probs log_probs = log_probs.float() del logits_orig # save memory *before* computing the loss diff --git a/nemo/collections/asr/losses/rnnt_pytorch.py b/nemo/collections/asr/losses/rnnt_pytorch.py index bc6e5a25a3b2..c8eee90a2eb5 100644 --- a/nemo/collections/asr/losses/rnnt_pytorch.py +++ b/nemo/collections/asr/losses/rnnt_pytorch.py @@ -47,7 +47,12 @@ def __init__(self, blank, reduction): self.reduction = reduction def forward(self, acts, labels, act_lens, label_lens): + # CPU patch for FP16 + if not acts.is_cuda and acts.dtype == torch.float16: + acts = acts.float() + acts = torch.log_softmax(acts, -1) + forward_logprob = self.compute_forward_prob(acts, labels, act_lens, label_lens) losses = -forward_logprob if self.reduction == 'mean_batch': diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py index 118ee88acbfe..046aea425e20 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py @@ -186,7 +186,7 @@ def rnnt_loss_gpu( # Select GPU index cuda.select_device(acts.device.index) - gpu_workspace = torch.zeros(gpu_size, device=acts.device, dtype=acts.dtype, requires_grad=False) + gpu_workspace = torch.zeros(gpu_size, device=acts.device, dtype=torch.float32, requires_grad=False) ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### acts, acts_shape = rnnt_helper.flatten_tensor(acts) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py index eaa6d332a0fc..58508970aa83 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py @@ -344,10 +344,15 @@ def forward(self, acts, labels, act_lens, label_lens): 
_assert_no_grad(label_lens) certify_inputs(acts, labels, act_lens, label_lens) + # CPU Patch for fp16 - force cast to fp32 + if not acts.is_cuda and acts.dtype == torch.float16: + acts = acts.float() + if self.clamp > 0.0: acts = LogSoftmaxGradModification.apply(acts, self.clamp) acts = torch.nn.functional.log_softmax(acts, -1) + return self.rnnt(acts, labels, act_lens, label_lens, self.blank, self.fastemit_lambda) diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py index 2ffe08be361e..5960d5ab6b18 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py @@ -57,7 +57,7 @@ def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_ loss_func = rnnt.rnnt_loss_gpu if is_cuda else rnnt.rnnt_loss_cpu grads = torch.zeros_like(acts) if acts.requires_grad else None minibatch_size = acts.size(0) - costs = torch.zeros(minibatch_size, device=acts.device, dtype=acts.dtype) + costs = torch.zeros(minibatch_size, device=acts.device, dtype=torch.float32) loss_func( acts, @@ -119,7 +119,6 @@ def forward( label_lens: Tensor of (batch) containing label length of each example fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - durations: list of durations for TDT model, must include 0 and 1, e.g. [0, 1, 2, 3, 4]. 
sigma: hyper-parameter for logit under-normalization method for training @@ -417,6 +416,10 @@ def forward(self, acts, labels, act_lens, label_lens): label_lens: Tensor of (batch) containing label length of each example """ if not acts.is_cuda: + # Force FP32 until log_softmax() is implemented for fp16 on CPU + if acts.dtype == torch.float16: + acts = acts.float() + # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping # *after* we have obtained the gradients of loss(logsoftmax()). # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py index 1528606716e1..3feb7b513a50 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py @@ -231,8 +231,8 @@ def cost_and_grad_kernel( ) # Scale llForward by FastEmit lambda - llForward *= 1.0 + self.fastemit_lambda_ - llBackward *= 1.0 + self.fastemit_lambda_ + llForward += llForward * self.fastemit_lambda_ + llBackward += llBackward * self.fastemit_lambda_ diff = (llForward - llBackward).abs() if diff > 0.1: @@ -300,6 +300,10 @@ def compute_betas_and_grads( Returns: Loglikelihood of the forward variable and inplace updates the grad tensor. 
""" + # Patch for CPU + fp16 + if log_probs.dtype == torch.float16 and not log_probs.is_cuda: + log_probs = log_probs.float() + idx = CpuRNNT_index(U, self.maxU_, self.minibatch_, self.alphabet_size_, self.batch_first) betas[idx(T - 1, U - 1)] = log_probs[idx(T - 1, U - 1) * 2] diff --git a/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py b/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py index b579b7315ef2..6ca7cd237264 100644 --- a/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py +++ b/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py @@ -30,6 +30,7 @@ import math from typing import Optional, Tuple +import numba import torch from numba import cuda @@ -112,7 +113,7 @@ def compute_costs_data(source: torch.Tensor, dest: torch.Tensor, fastemit_lambda if idx < length: copy_data_1d(source, dest, idx) dest[idx] *= -1.0 - dest[idx] *= 1.0 + fastemit_lambda + dest[idx] *= numba.float32(1.0 + fastemit_lambda) def get_workspace_size( diff --git a/nemo/core/utils/numba_utils.py b/nemo/core/utils/numba_utils.py index 6e1a8cb247d6..04010a2f7db4 100644 --- a/nemo/core/utils/numba_utils.py +++ b/nemo/core/utils/numba_utils.py @@ -17,6 +17,8 @@ import operator import os +from typing import Tuple, Union + from nemo.utils import model_utils # Prevent Numba CUDA logs from showing at info level @@ -26,6 +28,11 @@ __NUMBA_DEFAULT_MINIMUM_VERSION__ = "0.53.0" __NUMBA_MINIMUM_VERSION__ = os.environ.get("NEMO_NUMBA_MINVER", __NUMBA_DEFAULT_MINIMUM_VERSION__) +__NUMBA_MINIMUM_VERSION_FP16_SUPPORTED__ = "0.57.0" +NUMBA_FP16_SUPPORTED = model_utils.check_lib_version( + 'numba', __NUMBA_MINIMUM_VERSION_FP16_SUPPORTED__, operator=operator.ge +)[0] + NUMBA_INSTALLATION_MESSAGE = ( "Could not import `numba`.\n" @@ -148,6 +155,35 @@ def numba_cuda_is_supported(min_version: str) -> bool: return False +def is_numba_cuda_fp16_supported(return_reason: bool = False) -> Union[bool, Tuple[bool, str]]: + """ + Utility method that returns a bool, 
stating if FP16 is supported for numba cuda kernels or not. + + Returns: + bool, whether Numba CUDA will support fp16 or not. + """ + reason = "" + use_nvidia_binding = os.environ.get('NUMBA_CUDA_USE_NVIDIA_BINDING', None) + if use_nvidia_binding is not None: + use_nvidia_binding = use_nvidia_binding.lower() == "1" + reason += "Env variable `NUMBA_CUDA_USE_NVIDIA_BINDING` is available and set to `1`. " + else: + use_nvidia_binding = False + reason += "Env variable `NUMBA_CUDA_USE_NVIDIA_BINDING` is not available or has not set to `1`." + + if NUMBA_FP16_SUPPORTED: + reason += f"Numba CUDA FP16 is supported in installed numba version." + else: + reason += f"Numba CUDA FP16 is not supported in installed numba version." + + result = use_nvidia_binding and NUMBA_FP16_SUPPORTED + + if return_reason: + return result, reason + else: + return result + + def skip_numba_cuda_test_if_unsupported(min_version: str): """ Helper method to skip pytest test case if numba cuda is not supported. diff --git a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py index 3fbfcf6df54b..1a29a14f540d 100644 --- a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py +++ b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py @@ -34,9 +34,14 @@ DEVICES.append('cuda') +DTYPES = [np.float32] +if numba_utils.is_numba_cuda_fp16_supported(): + DTYPES.append(np.float16) + + def wrap_and_call(fn, acts, labels, device): if not torch.is_tensor(acts): - acts = torch.FloatTensor(acts) + acts = torch.tensor(acts) if 'cuda' in device: acts = acts.cuda() @@ -72,7 +77,8 @@ def wrap_and_call(fn, acts, labels, device): class TestRNNTLossPytorch: @pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) - def test_case_small(self, device): + @pytest.mark.parametrize('dtype', DTYPES) + def test_case_small(self, device, dtype): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) @@ -83,9 
+89,13 @@ def test_case_small(self, device): [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.2, 0.1, 0.1], [0.7, 0.1, 0.2, 0.1, 0.1]], ] ] - ) + ).astype(dtype) labels = [[1, 2]] + cost_threshold = 1e-8 if dtype == np.float32 else 5e-4 + grad_threshold = 1e-8 if dtype == np.float32 else 1e-4 + rtol = 1e-5 if dtype == np.float32 else 1e-3 + fn_pt = RNNTLossNumba(blank=0, reduction='sum') pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) @@ -113,23 +123,28 @@ def test_case_small(self, device): ] ) - assert np.allclose(pt_cost, expected_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_grads, expected_grads), "small_test gradient mismatch." + assert np.allclose(pt_cost, expected_cost, atol=cost_threshold, rtol=1e-6), "small_test costs mismatch." + assert np.allclose(pt_grads, expected_grads, atol=grad_threshold, rtol=rtol), "small_test gradient mismatch." - assert np.allclose(pt_cost, np_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_grads, np_grads), "small_test gradient mismatch." + assert np.allclose(pt_cost, np_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(pt_grads, np_grads, atol=grad_threshold, rtol=rtol), "small_test gradient mismatch." - assert np.allclose(ag_cost, np_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(ag_grads, np_grads), "small_test gradient mismatch." + assert np.allclose(ag_cost, np_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(ag_grads, np_grads, atol=cost_threshold, rtol=rtol), "small_test gradient mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) - def test_case_small_random(self, device): + @pytest.mark.parametrize('dtype', DTYPES) + def test_case_small_random(self, device, dtype): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + cost_threshold = 1e-8 if dtype == np.float32 else 5e-4 + grad_threshold = 1e-8 if dtype == np.float32 else 1e-4 + rtol = 1e-5 if dtype == np.float32 else 1e-3 + rng = np.random.RandomState(0) - acts = rng.randn(1, 4, 3, 3) + acts = rng.randn(1, 4, 3, 3).astype(dtype) labels = [[1, 2]] fn_pt = RNNTLossNumba(blank=0, reduction='sum') @@ -141,16 +156,17 @@ def test_case_small_random(self, device): fn_ag = RNNTLossPytorch(blank=0, reduction='sum') # ag for automatic gradient computation ag_cost, ag_grads = wrap_and_call(fn_ag, acts, labels, device) - assert np.allclose(pt_cost, np_cost, rtol=1e-6), "small_random_test costs mismatch." - assert np.allclose(pt_grads, np_grads), "small_random_test gradient mismatch." + assert np.allclose(pt_cost, np_cost, atol=cost_threshold, rtol=rtol), "small_random_test costs mismatch." + assert np.allclose(pt_grads, np_grads, atol=grad_threshold, rtol=rtol), "small_random_test gradient mismatch." - assert np.allclose(pt_cost, ag_cost, rtol=1e-6), "small_random_test costs mismatch." - assert np.allclose(pt_grads, ag_grads), "small_random_test gradient mismatch." + assert np.allclose(pt_cost, ag_cost, atol=cost_threshold, rtol=rtol), "small_random_test costs mismatch." + assert np.allclose(pt_grads, ag_grads, atol=grad_threshold, rtol=rtol), "small_random_test gradient mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) + @pytest.mark.parametrize('dtype', DTYPES) @pytest.mark.parametrize('fastemit_lambda', [1.0, 0.01, 0.00001]) - def test_case_small_random_fastemit_reg(self, device, fastemit_lambda): + def test_case_small_random_fastemit_reg(self, device, dtype, fastemit_lambda): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) @@ -165,11 +181,12 @@ def test_case_small_random_fastemit_reg(self, device, fastemit_lambda): np_cost, np_grads = wrap_and_call(fn_np, acts, labels, device) assert np.allclose(pt_cost, np_cost, rtol=1e-6), "small_random_test costs mismatch." - assert np.allclose(pt_grads, np_grads, atol=1e-5, rtol=1e-5), "small_random_test gradient mismatch." + assert np.allclose(pt_grads, np_grads, rtol=1e-5), "small_random_test gradient mismatch." @pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) - def test_case_big_tensor(self, device): + @pytest.mark.parametrize('dtype', DTYPES) + def test_case_big_tensor(self, device, dtype): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) @@ -269,9 +286,13 @@ def test_case_big_tensor(self, device): ], ] - activations = np.array(activations) + activations = np.array(activations).astype(dtype) labels = [[1, 2], [1, 1]] + cost_threshold = 1e-8 if dtype == np.float32 else 5e-4 + grad_threshold = 1e-8 if dtype == np.float32 else 1e-4 + rtol = 1e-3 if dtype == np.float32 else 0.1 + fn_pt = RNNTLossNumba(blank=0, reduction='sum') pt_costs, pt_grads = wrap_and_call(fn_pt, activations, labels, device) @@ -281,23 +302,30 @@ def test_case_big_tensor(self, device): fn_ag = RNNTLossPytorch(blank=0, reduction='sum') ag_costs, ag_grads = wrap_and_call(fn_ag, activations, labels, device) - assert np.allclose(pt_costs, sum(expected_costs)), "big_test average costs mismatch." - assert np.allclose(pt_grads, expected_grads, rtol=1e-3), "big_test grads for average cost mismatch." 
+ assert np.allclose(pt_costs, sum(expected_costs), atol=cost_threshold), "big_test average costs mismatch." + assert np.allclose( + pt_grads, expected_grads, atol=grad_threshold, rtol=1e-3 + ), "big_test grads for average cost mismatch." - assert np.allclose(pt_costs, np_costs), "big_test average costs mismatch." - assert np.allclose(pt_grads, np_grads, rtol=1e-3), "big_test grads for average cost mismatch." + assert np.allclose(pt_costs, np_costs, atol=cost_threshold, rtol=rtol), "big_test average costs mismatch." + assert np.allclose( + pt_grads, np_grads, atol=grad_threshold, rtol=rtol + ), "big_test grads for average cost mismatch." - assert np.allclose(pt_costs, ag_costs), "big_test average costs mismatch." - assert np.allclose(pt_grads, ag_grads, rtol=1e-3), "big_test grads for average cost mismatch." + assert np.allclose(pt_costs, ag_costs, atol=cost_threshold, rtol=rtol), "big_test average costs mismatch." + assert np.allclose( + pt_grads, ag_grads, atol=grad_threshold, rtol=rtol + ), "big_test grads for average cost mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) - def test_case_large_random(self, device): + @pytest.mark.parametrize('dtype', DTYPES) + def test_case_large_random(self, device, dtype): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) rng = np.random.RandomState(0) - acts = rng.randn(4, 8, 11, 5) + acts = rng.randn(4, 8, 11, 5).astype(dtype) labels = [ [1, 2, 4, 3, 2, 2, 1, 1, 1, 1], [3, 2, 2, 3, 4, 1, 1, 1, 1, 1], @@ -305,6 +333,10 @@ def test_case_large_random(self, device): [1, 1, 2, 1, 2, 3, 3, 1, 1, 1], ] + cost_threshold = 1e-8 if dtype == np.float32 else 5e-4 + grad_threshold = 1e-8 if dtype == np.float32 else 1e-4 + rtol = 1e-3 if dtype == np.float32 else 5e-2 + fn_pt = RNNTLossNumba(blank=0, reduction='sum') pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) @@ -314,14 +346,15 @@ def test_case_large_random(self, device): fn_ag = RNNTLossPytorch(blank=0, reduction='sum') ag_cost, ag_grads = wrap_and_call(fn_ag, acts, labels, device) - assert np.allclose(pt_cost, np_cost, atol=1e-5, rtol=1e-3), "large_random_test costs mismatch." - assert np.allclose(ag_cost, np_cost, atol=1e-5, rtol=1e-3), "large_random_test costs mismatch." - assert np.allclose(pt_grads, np_grads, atol=1e-5, rtol=1e-3), "large_random_test gradient mismatch." - assert np.allclose(ag_grads, np_grads, atol=1e-5, rtol=1e-3), "large_random_test gradient mismatch." + assert np.allclose(pt_cost, np_cost, atol=cost_threshold, rtol=rtol), "large_random_test costs mismatch." + assert np.allclose(ag_cost, np_cost, atol=cost_threshold, rtol=rtol), "large_random_test costs mismatch." + assert np.allclose(pt_grads, np_grads, atol=grad_threshold, rtol=rtol), "large_random_test gradient mismatch." + assert np.allclose(ag_grads, np_grads, atol=grad_threshold, rtol=rtol), "large_random_test gradient mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) - def test_case_small_clamp(self, device): + @pytest.mark.parametrize('dtype', DTYPES) + def test_case_small_clamp(self, device, dtype): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) @@ -333,9 +366,13 @@ def test_case_small_clamp(self, device): [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.2, 0.1, 0.1], [0.7, 0.1, 0.2, 0.1, 0.1]], ] ] - ) + ).astype(dtype) labels = [[1, 2]] + cost_threshold = 1e-8 if dtype == np.float32 else 5e-4 + grad_threshold = 1e-8 if dtype == np.float32 else 5e-5 + rtol = 1e-5 if dtype == np.float32 else 1e-3 + fn_pt = RNNTLossNumba(blank=0, reduction='sum', clamp=GRAD_CLAMP) pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) @@ -360,16 +397,17 @@ def test_case_small_clamp(self, device): ] ) - assert np.allclose(pt_cost, expected_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_grads, expected_grads), "small_test gradient mismatch." + assert np.allclose(pt_cost, expected_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(pt_grads, expected_grads, atol=grad_threshold, rtol=rtol), "small_test gradient mismatch." - assert np.allclose(pt_cost, np_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_grads, np_grads), "small_test gradient mismatch." + assert np.allclose(pt_cost, np_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(pt_grads, np_grads, atol=grad_threshold, rtol=rtol), "small_test gradient mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) + @pytest.mark.parametrize('dtype', DTYPES) @pytest.mark.parametrize('fastemit_lambda', [1.0, 0.01, 0.00001]) - def test_case_small_fastemit_clamp(self, device, fastemit_lambda): + def test_case_small_fastemit_clamp(self, device, dtype, fastemit_lambda): if device == 'cuda': numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) @@ -381,9 +419,13 @@ def test_case_small_fastemit_clamp(self, device, fastemit_lambda): [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.2, 0.1, 0.1], [0.7, 0.1, 0.2, 0.1, 0.1]], ] ] - ) + ).astype(dtype) labels = [[1, 2]] + cost_threshold = 1e-8 if dtype == np.float32 else 1e-3 + grad_threshold = 1e-8 if dtype == np.float32 else 5e-4 + rtol = 1e-5 if dtype == np.float32 else 1e-3 + fn_pt = RNNTLossNumba(blank=0, reduction='sum', fastemit_lambda=fastemit_lambda, clamp=GRAD_CLAMP) pt_cost, pt_grads = wrap_and_call(fn_pt, acts, labels, device) @@ -393,9 +435,9 @@ def test_case_small_fastemit_clamp(self, device, fastemit_lambda): expected_cost = 4.495666 expected_cost += expected_cost * fastemit_lambda - assert np.allclose(pt_cost, expected_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_cost, np_cost, rtol=1e-6), "small_test costs mismatch." - assert np.allclose(pt_grads, np_grads), "small_test gradient mismatch." + assert np.allclose(pt_cost, expected_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(pt_cost, np_cost, atol=cost_threshold, rtol=rtol), "small_test costs mismatch." + assert np.allclose(pt_grads, np_grads, atol=grad_threshold, rtol=rtol), "small_test gradient mismatch." 
@pytest.mark.unit @pytest.mark.parametrize('device', DEVICES) diff --git a/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py b/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py index 230b6b7c099f..cb5a9816e237 100644 --- a/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py +++ b/tests/collections/asr/numba/rnnt_loss/utils/test_gpu_rnnt_kernel.py @@ -25,8 +25,14 @@ from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ +DTYPES = [torch.float32] +if numba_utils.is_numba_cuda_fp16_supported(): + DTYPES.append(torch.float16) + + def log_softmax(x, axis=-1): x = torch.from_numpy(x) # zero-copy + x = x.float() x = torch.log_softmax(x, dim=axis) x = x.numpy() return x @@ -42,12 +48,14 @@ def log_softmax_grad(x, axis=-1): class TestRNNTCUDAKernels: @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_compute_alphas_kernel(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_compute_alphas_kernel(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape + threshold = 1e-5 if dtype == torch.float32 else 3e-4 # Numpy kernel x = random.randn(*original_shape) @@ -67,7 +75,7 @@ def test_compute_alphas_kernel(self): else: stream = cuda.default_stream() - x_c = torch.tensor(x, device=device, dtype=torch.float32) + x_c = torch.tensor(x, device=device, dtype=dtype) labels_c = torch.tensor(labels, device=device, dtype=torch.int64) # Allocate workspace memory @@ -100,22 +108,24 @@ def test_compute_alphas_kernel(self): alphas = alphas.view([B, T, U]) diff = ground_alphas - alphas[0].cpu().numpy() - assert np.abs(diff).mean() <= 1e-5 - assert np.square(diff).mean() <= 1e-10 + assert np.abs(diff).mean() <= threshold + assert np.square(diff).mean() <= (threshold ** 2) ll_diff = ground_log_likelihood - 
llForward[0].cpu().numpy() - assert np.abs(ll_diff).mean() <= 1e-5 - assert np.square(ll_diff).mean() <= 1e-10 + assert np.abs(ll_diff).mean() <= threshold + assert np.square(ll_diff).mean() <= (threshold ** 2) @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_compute_betas_kernel(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_compute_betas_kernel(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape + threshold = 1e-5 if dtype == torch.float32 else 3e-4 # Numpy kernel x = random.randn(*original_shape) @@ -135,7 +145,7 @@ def test_compute_betas_kernel(self): else: stream = cuda.default_stream() - x_c = torch.tensor(x, device=device, dtype=torch.float32) + x_c = torch.tensor(x, device=device, dtype=dtype) labels_c = torch.tensor(labels, device=device, dtype=torch.int64) # Allocate workspace memory @@ -168,17 +178,18 @@ def test_compute_betas_kernel(self): betas = betas.view([B, T, U]) diff = ground_alphas - betas[0].cpu().numpy() - assert np.abs(diff).mean() <= 1e-5 - assert np.square(diff).mean() <= 1e-10 + assert np.abs(diff).mean() <= threshold + assert np.square(diff).mean() <= (threshold ** 2) ll_diff = ground_log_likelihood - llBackward[0].cpu().numpy() - assert np.abs(ll_diff).mean() <= 1e-5 - assert np.square(ll_diff).mean() <= 1e-10 + assert np.abs(ll_diff).mean() <= threshold + assert np.square(ll_diff).mean() <= (threshold ** 2) @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_compute_grads_kernel(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_compute_grads_kernel(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) fastemit_lambda = 0.0 @@ -187,6 +198,7 @@ def 
test_compute_grads_kernel(self): random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape + threshold = 1e-5 if dtype == torch.float32 else 3e-5 # Numpy kernel x = random.randn(*original_shape) @@ -220,7 +232,7 @@ def test_compute_grads_kernel(self): else: stream = cuda.default_stream() - x_c = torch.tensor(x, device=device, dtype=torch.float32) + x_c = torch.tensor(x, device=device, dtype=dtype) labels_c = labels.clone().to(device=device, dtype=torch.int64) # Allocate workspace memory @@ -283,12 +295,13 @@ def test_compute_grads_kernel(self): grads = grads.view([B, T, U, V]) diff = true_grads - grads[0].cpu().numpy() - assert np.abs(diff).mean() <= 1e-5 - assert np.square(diff).mean() <= 1e-10 + assert np.abs(diff).mean() <= threshold + assert np.square(diff).mean() <= (threshold ** 2) * 5.0 @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_compute_grads_kernel_fastemit(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_compute_grads_kernel_fastemit(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) fastemit_lambda = 0.001 @@ -297,6 +310,7 @@ def test_compute_grads_kernel_fastemit(self): random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape + threshold = 1e-5 if dtype == torch.float32 else 3e-5 # Numpy kernel x = random.randn(*original_shape) @@ -330,7 +344,7 @@ def test_compute_grads_kernel_fastemit(self): else: stream = cuda.default_stream() - x_c = torch.tensor(x, device=device, dtype=torch.float32) + x_c = torch.tensor(x, device=device, dtype=dtype) labels_c = labels.clone().to(device=device, dtype=torch.int64) # Allocate workspace memory @@ -393,12 +407,13 @@ def test_compute_grads_kernel_fastemit(self): grads = grads.view([B, T, U, V]) diff = true_grads - grads[0].cpu().numpy() - assert np.abs(diff).mean() <= 1e-5 - assert np.square(diff).mean() <= 
1e-10 + assert np.abs(diff).mean() <= threshold + assert np.square(diff).mean() <= (threshold ** 2) * 5 @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_compute_grads_kernel_clamp(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_compute_grads_kernel_clamp(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) fastemit_lambda = 0.0 @@ -407,6 +422,7 @@ def test_compute_grads_kernel_clamp(self): random = np.random.RandomState(0) original_shape = [1, 5, 11, 3] B, T, U, V = original_shape + threshold = 1e-5 if dtype == torch.float32 else 3e-5 # Numpy kernel x = random.randn(*original_shape) @@ -440,7 +456,7 @@ def test_compute_grads_kernel_clamp(self): else: stream = cuda.default_stream() - x_c = torch.tensor(x, device=device, dtype=torch.float32) + x_c = torch.tensor(x, device=device, dtype=dtype) labels_c = labels.clone().to(device=device, dtype=torch.int64) # Allocate workspace memory @@ -503,8 +519,8 @@ def test_compute_grads_kernel_clamp(self): grads = grads.view([B, T, U, V]) diff = true_grads - grads[0].cpu().numpy() - assert np.abs(diff).mean() <= 1e-5 - assert np.square(diff).mean() <= 1e-10 + assert np.abs(diff).mean() <= threshold + assert np.square(diff).mean() <= (threshold ** 2) * 5 class TestTDTCUDAKernels: diff --git a/tests/collections/asr/numba/rnnt_loss/utils/test_reduce.py b/tests/collections/asr/numba/rnnt_loss/utils/test_reduce.py index 7c2ba6a41208..5994d53e1d8f 100644 --- a/tests/collections/asr/numba/rnnt_loss/utils/test_reduce.py +++ b/tests/collections/asr/numba/rnnt_loss/utils/test_reduce.py @@ -20,17 +20,22 @@ from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ +DTYPES = [np.float32] +if numba_utils.is_numba_cuda_fp16_supported(): + DTYPES.append(np.float16) + class TestRNNTCUDAReductions: @pytest.mark.skipif(not cuda.is_available(), reason="CUDA 
Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_reduce_max(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_reduce_max(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) random = np.random.RandomState(0) original_shape = [1, 5, 4, 3] - x = random.randn(*original_shape).reshape([-1]) - dx = random.randn(*x.shape) + x = random.randn(*original_shape).reshape([-1]).astype(dtype) + dx = random.randn(*x.shape).astype(dtype) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -53,13 +58,14 @@ def test_reduce_max(self): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Reductions can only be run when CUDA is available") @pytest.mark.unit - def test_reduce_exp(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_reduce_exp(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) random = np.random.RandomState(0) original_shape = [1, 5, 4, 2] - x = random.randn(*original_shape).reshape([-1]) - dx = np.zeros_like(x) + x = random.randn(*original_shape).reshape([-1]).astype(dtype) + dx = np.zeros_like(x).astype(dtype) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) diff --git a/tests/collections/asr/numba/rnnt_loss/utils/test_rnnt_helper.py b/tests/collections/asr/numba/rnnt_loss/utils/test_rnnt_helper.py index 243fe727e172..08f12da8324d 100644 --- a/tests/collections/asr/numba/rnnt_loss/utils/test_rnnt_helper.py +++ b/tests/collections/asr/numba/rnnt_loss/utils/test_rnnt_helper.py @@ -20,11 +20,16 @@ from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ +DTYPES = [np.float32] +if numba_utils.is_numba_cuda_fp16_supported(): + DTYPES.append(np.float16) + class TestRNNTHelper: @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_log_sum_exp(self): + @pytest.mark.parametrize('dtype', DTYPES) 
+ def test_log_sum_exp(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -34,8 +39,9 @@ def _kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos]) - x = np.zeros([8]) # np.random.rand(8192) - y = np.ones([8]) # np.random.rand(8192) + x = np.zeros([8]).astype(dtype) # np.random.rand(8192) + y = np.ones([8]).astype(dtype) # np.random.rand(8192) + threshold = 1e-5 if dtype == np.float32 else 2e-3 stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -52,11 +58,12 @@ def _kernel(x, y): x_new = x_c.copy_to_host(stream=stream) del x_c, y_c - assert (x_new.sum() - 10.506093500145782) <= 1e-5 + assert (x_new.sum() - 10.506093500145782) <= threshold @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_log_sum_exp_neg_inf(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_log_sum_exp_neg_inf(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -66,8 +73,8 @@ def _kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.log_sum_exp(x[x_pos], y[x_pos]) - x = np.asarray([global_constants.FP32_NEG_INF] * 8) - y = np.ones([len(x)]) + x = np.asarray([global_constants.FP32_NEG_INF] * 8).astype(dtype) + y = np.ones([len(x)]).astype(dtype) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -88,7 +95,8 @@ def _kernel(x, y): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_div_up(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_div_up(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -98,8 +106,8 @@ def 
_kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.div_up(x[x_pos], y[x_pos]) - x = np.full([8], fill_value=10) # np.random.rand(8192) - y = np.full([8], fill_value=2) # np.random.rand(8192) + x = np.full([8], fill_value=10).astype(dtype) # np.random.rand(8192) + y = np.full([8], fill_value=2).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -121,7 +129,8 @@ def _kernel(x, y): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_add(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_add(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -131,8 +140,8 @@ def _kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.add(x[x_pos], y[x_pos]) - x = np.full([8], fill_value=10) # np.random.rand(8192) - y = np.full([8], fill_value=2) # np.random.rand(8192) + x = np.full([8], fill_value=10).astype(dtype) # np.random.rand(8192) + y = np.full([8], fill_value=2).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -154,7 +163,8 @@ def _kernel(x, y): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_maximum(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_maximum(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -164,8 +174,8 @@ def _kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.maximum(x[x_pos], y[x_pos]) - x = np.full([8], fill_value=10) # np.random.rand(8192) - y = np.full([8], fill_value=2) # np.random.rand(8192) + x = np.full([8], fill_value=10).astype(dtype) # np.random.rand(8192) + y = np.full([8], 
fill_value=2).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -187,7 +197,8 @@ def _kernel(x, y): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_identity(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_identity(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -197,7 +208,7 @@ def _kernel(x): if x_pos < x.shape[0]: x[x_pos] = rnnt_helper.identity(x[x_pos]) - x = np.full([8], fill_value=10) # np.random.rand(8192) + x = np.full([8], fill_value=10).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -218,7 +229,8 @@ def _kernel(x): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_negate(self): + @pytest.mark.parametrize('dtype', [np.float32, np.float16]) + def test_negate(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -228,7 +240,7 @@ def _kernel(x): if x_pos < x.shape[0]: x[x_pos] = rnnt_helper.negate(x[x_pos]) - x = np.full([8], fill_value=10) # np.random.rand(8192) + x = np.full([8], fill_value=10).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -249,7 +261,8 @@ def _kernel(x): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_exponential(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_exponential(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -259,7 +272,7 @@ def _kernel(x): if x_pos < x.shape[0]: x[x_pos] = rnnt_helper.exponential(x[x_pos]) 
- x = np.random.rand(8) + x = np.random.rand(8).astype(dtype) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -281,7 +294,8 @@ def _kernel(x): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.unit - def test_log_plus(self): + @pytest.mark.parametrize('dtype', DTYPES) + def test_log_plus(self, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) # wrapper kernel for device function that is tested @@ -291,8 +305,8 @@ def _kernel(x, y): if x_pos < x.shape[0] and x_pos < y.shape[0]: x[x_pos] = rnnt_helper.log_plus(x[x_pos], y[x_pos]) - x = np.full([8], fill_value=10.0) # np.random.rand(8192) - y = np.full([8], fill_value=2.0) # np.random.rand(8192) + x = np.full([8], fill_value=10.0).astype(dtype) # np.random.rand(8192) + y = np.full([8], fill_value=2.0).astype(dtype) # np.random.rand(8192) stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -317,12 +331,15 @@ def _kernel(x, y): @pytest.mark.skipif(not cuda.is_available(), reason="CUDA Helpers can only be run when CUDA is available") @pytest.mark.parametrize('batch_size', [8, 128, 256]) @pytest.mark.parametrize('fastemit_lambda', [0.0, 0.001]) + @pytest.mark.parametrize('dtype', DTYPES) @pytest.mark.unit - def test_compute_costs_data(self, batch_size, fastemit_lambda): + def test_compute_costs_data(self, batch_size, fastemit_lambda, dtype): numba_utils.skip_numba_cuda_test_if_unsupported(__NUMBA_MINIMUM_VERSION__) + np.random.seed(0) x = np.full([batch_size], fill_value=0.0) # np.random.rand(8192) - y = np.random.randn(batch_size) # np.random.rand(8192) + y = np.random.randn(batch_size).astype(dtype) # np.random.rand(8192) + threshold = 1e-5 if dtype == np.float32 else 1e-5 stream = cuda.stream() x_c = cuda.to_device(x, stream=stream) @@ -340,11 +357,11 @@ def test_compute_costs_data(self, batch_size, fastemit_lambda): x_new = x_c.copy_to_host(stream=stream) del x_c, y_c - res = -(y.copy()) 
+ res = -(y.astype(np.float32).copy()) res *= 1.0 + fastemit_lambda for i in range(len(x_new)): - assert x_new[i] == res[i], f"index failed {i}" + assert abs(x_new[i] - res[i]) < threshold, f"index failed {i}" if __name__ == '__main__': From a69f0f4417e071965b2f0dc1910687aec4bfc90e Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 12 Jul 2023 10:21:41 -0700 Subject: [PATCH 100/123] ptuning inference table bug fix (#7015) * remove hardcoded input and output Signed-off-by: arendu * fix inf table Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala --- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index fe339c6f9a8b..1818d33dc0d3 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -268,7 +268,7 @@ def __init__( # (@adithyare) the persistent=False will not pollute the indices into the state_dict of this module. 
self.register_buffer("indices", torch.LongTensor(list(range(self.virtual_tokens))), persistent=False) self.embedding = torch.nn.Embedding(self.virtual_tokens, self.embedding_dim) - self.inference_table = InferenceTable("taskname", self.embedding_dim, self.virtual_tokens) + self.inference_table = InferenceTable("taskname", self.output_dim, self.virtual_tokens) self.first = ColumnParallelLinear( self.embedding_dim, self.bottleneck_dim, From 728403d8c20069865e75effbbb7cfbfccce6589e Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Wed, 12 Jul 2023 14:08:56 -0700 Subject: [PATCH 101/123] [TTS] Add tutorial for TTS data prep scripts (#6922) * [TTS] Add tutorial for TTS data prep scripts --------- Signed-off-by: Ryan Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- .../tts/FastPitch_Data_Preparation.ipynb | 1126 +++++++++++++++++ 1 file changed, 1126 insertions(+) create mode 100644 tutorials/tts/FastPitch_Data_Preparation.ipynb diff --git a/tutorials/tts/FastPitch_Data_Preparation.ipynb b/tutorials/tts/FastPitch_Data_Preparation.ipynb new file mode 100644 index 000000000000..99cb32171da7 --- /dev/null +++ b/tutorials/tts/FastPitch_Data_Preparation.ipynb @@ -0,0 +1,1126 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Introduction" + ], + "metadata": { + "id": "rtBDkKqVGZJ8" + } + }, + { + "cell_type": "markdown", + "source": [ + "In this tutorial, we will prepare a dataset using our [TTS Dataset Processing Scripts](https://github.com/NVIDIA/NeMo/tree/main/scripts/dataset_processing/tts) and use it for training a FastPitch model.\n", + "\n", + "**This tutorial uses a different workflow than all other existing TTS 
tutorials. The scripts and classes used are all experimental and not yet ready for production**." + ], + "metadata": { + "id": "pZ2QSsXuGbMe" + } + }, + { + "cell_type": "markdown", + "source": [ + "# License" + ], + "metadata": { + "id": "7X-TwhdTGmlc" + } + }, + { + "cell_type": "markdown", + "source": [ + "> Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", + ">\n", + "> Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", + ">\n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + ">\n", + "> Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + ], + "metadata": { + "id": "fCQUeZRPGnoe" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Install" + ], + "metadata": { + "id": "3OZassNG5xff" + } + }, + { + "cell_type": "code", + "source": [ + "BRANCH = 'main'\n", + "NEMO_ROOT_DIR = '/content/nemo'" + ], + "metadata": { + "id": "QLLoj7bD0W5f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WZvQvPkIhRi3" + }, + "outputs": [], + "source": [ + "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", + "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", + "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "code", + "source": [ + "# Download local version of NeMo scripts. 
If you are running locally and want to use your own local NeMo code,\n", + "# comment out the below lines and set NEMO_ROOT_DIR to your local path.\n", + "!git clone -b $BRANCH https://github.com/NVIDIA/NeMo.git $NEMO_ROOT_DIR" + ], + "metadata": { + "id": "tvsgWO_WhV3M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Dataset Preparation" + ], + "metadata": { + "id": "fM4QPsLTnzK7" + } + }, + { + "cell_type": "markdown", + "source": [ + "For our tutorial, we use a subset of [VCTK](https://datashare.ed.ac.uk/handle/10283/2950) dataset with 5 speakers (p225-p229)." + ], + "metadata": { + "id": "tkZC6Dl7KRl6" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import tarfile\n", + "import wget\n", + "from pathlib import Path\n", + "\n", + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest" + ], + "metadata": { + "id": "sYzvAYr2vo1K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Configure nemo paths\n", + "NEMO_DIR = Path(NEMO_ROOT_DIR)\n", + "NEMO_EXAMPLES_DIR = NEMO_DIR / \"examples\" / \"tts\"\n", + "NEMO_CONFIG_DIR = NEMO_EXAMPLES_DIR / \"conf\"\n", + "NEMO_SCRIPT_DIR = NEMO_DIR / \"scripts\" / \"dataset_processing\" / \"tts\"" + ], + "metadata": { + "id": "APo1m5M-v3pB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Create dataset directory\n", + "root_dir = Path(\"/content\")\n", + "data_root = root_dir / \"data\"\n", + "\n", + "data_root.mkdir(parents=True, exist_ok=True)" + ], + "metadata": { + "id": "aoxN1QsUzX-k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Download the dataset\n", + "dataset_url = \"https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz\"\n", + "dataset_tar_filepath = data_root / \"vctk.tar.gz\"\n", + "\n", + "if not os.path.exists(dataset_tar_filepath):\n", + 
" wget.download(dataset_url, out=str(dataset_tar_filepath))" + ], + "metadata": { + "id": "mArlQd5Hk36b" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Extract the dataset\n", + "with tarfile.open(dataset_tar_filepath) as tar_f:\n", + " tar_f.extractall(data_root)" + ], + "metadata": { + "id": "p987cjtOy9C7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "DATA_DIR = data_root / \"vctk_subset_multispeaker\"" + ], + "metadata": { + "id": "Ko6dxYJW0i3G" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Visualize the raw dataset\n", + "train_raw_filepath = DATA_DIR / \"train.json\"\n", + "!head $train_raw_filepath" + ], + "metadata": { + "id": "We5FHYQt5BeO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Manifest Processing" + ], + "metadata": { + "id": "i3jsk2HCMSU5" + } + }, + { + "cell_type": "markdown", + "source": [ + "The downloaded manifest uses our traditional format for TTS training. The scripts here require it to be formatted slightly differently.\n", + "\n", + "The `speaker` field used to be an *integer* ID corresponding to an array index that the FastPitch model would query. Now we represent it as a *string* so we can give each speaker a human-friendly name. The mapping from speaker name to speaker index will be provided at training time.\n", + "\n", + "As a best practice, we suggest prepending the `speaker` field with the name of the dataset so that it is guaranteed to be unique across all datasets (eg. *vctk_225*, instead of *225*).\n", + "\n", + "The `audio_filepath` field used to require an *absolute path* which had to be manually updated depending on where the dataset was on your computer. Absolute paths still work, but now you can optionally provide it as a *relative path*, with the root directory provided as an argument to each script." 
+ ], + "metadata": { + "id": "N8WuAGJsMHRn" + } + }, + { + "cell_type": "code", + "source": [ + "def update_metadata(data_type):\n", + " input_filepath = DATA_DIR / f\"{data_type}.json\"\n", + " output_filepath = DATA_DIR / f\"{data_type}_raw.json\"\n", + "\n", + " entries = read_manifest(input_filepath)\n", + " for entry in entries:\n", + " # Provide relative path instead of absolute path\n", + " entry[\"audio_filepath\"] = entry[\"audio_filepath\"].replace(\"audio/\", \"\")\n", + " # Prepend speaker ID with the name of the dataset: 'vctk'\n", + " entry[\"speaker\"] = f\"vctk_{entry['speaker']}\"\n", + "\n", + " write_manifest(output_path=output_filepath, target_manifest=entries, ensure_ascii=False)" + ], + "metadata": { + "id": "zoCRrKQ20VZP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "update_metadata(\"dev\")\n", + "update_metadata(\"train\")" + ], + "metadata": { + "id": "PaCc3GCG1UbH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Visualize updated 'audio_filepath' and 'speaker' fields\n", + "train_filepath = DATA_DIR / \"train_raw.json\"\n", + "!head $train_filepath" + ], + "metadata": { + "id": "bVLIB3Ip1Aqn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Text Preprocessing" + ], + "metadata": { + "id": "e3jHTOhL1M5_" + } + }, + { + "cell_type": "markdown", + "source": [ + "First we will process the text transcripts using the script [preprocess_text.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/tts/preprocess_text.py).\n", + "\n", + "This step mainly passes the text through our NeMo *text normalizer* and then stores the output in the `normalized_text` field. It also has a few optional transformations, such as lowercasing the text." 
+ ], + "metadata": { + "id": "H2rYykFLSR5t" + } + }, + { + "cell_type": "code", + "source": [ + "text_preprocessing_script = NEMO_SCRIPT_DIR / \"preprocess_text.py\"\n", + "\n", + "# Number of threads to parallelize text processing across\n", + "num_workers = 4\n", + "# Text normalizer to apply\n", + "normalizer_config_filepath = NEMO_CONFIG_DIR / \"text\" / \"normalizer_en.yaml\"\n", + "# Whether to lowercase output text. We can safely do this here because we will train on IPA phonemes.\n", + "# If training on graphemes only, then consider disabling this to leave text with its original capitalization.\n", + "lower_case = True\n", + "# Whether to overwrite output manifest, if it exists\n", + "overwrite_manifest = True\n", + "\n", + "# Python wrapper to invoke the given bash script with the given input args\n", + "def run_script(script, args):\n", + " args = ' \\\\'.join(args)\n", + " cmd = f\"python {script} \\\\{args}\"\n", + "\n", + " print(cmd.replace(\" \\\\\", \"\\n\"))\n", + " print()\n", + " !$cmd\n", + "\n", + "def preprocess_text(data_type):\n", + " input_filepath = DATA_DIR / f\"{data_type}_raw.json\"\n", + " output_filepath = DATA_DIR / f\"{data_type}_text.json\"\n", + "\n", + " args = [\n", + " f\"--input_manifest={input_filepath}\",\n", + " f\"--output_manifest={output_filepath}\",\n", + " f\"--num_workers={num_workers}\",\n", + " f\"--normalizer_config_path={normalizer_config_filepath}\",\n", + " f\"--lower_case={lower_case}\"\n", + " ]\n", + " if overwrite_manifest:\n", + " args.append(\"--overwrite\")\n", + "\n", + " run_script(text_preprocessing_script, args)" + ], + "metadata": { + "id": "6Z1vRsPd0g2s" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "preprocess_text(\"dev\")" + ], + "metadata": { + "id": "qg6iK3NyrZvx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "preprocess_text(\"train\")" + ], + "metadata": { + "id": "DkLhSL_n1QAS" + }, + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Visualize the output of the 'normalized_text' field.\n", + "train_text_filepath = DATA_DIR / \"train_text.json\"\n", + "!head $train_text_filepath" + ], + "metadata": { + "id": "6qHbl0Cf5kQn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Audio Preprocessing" + ], + "metadata": { + "id": "alrRDWio41qi" + } + }, + { + "cell_type": "markdown", + "source": [ + "Next we process the audio data using [preprocess_audio.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/tts/preprocess_audio.py).\n", + "\n", + "During this step we apply the following transformations:\n", + "\n", + "1. Resample the audio from 48khz to 44.1khz so that it is compatible with our default training configuration.\n", + "2. Remove long silence from the beginning and end of each audio file. This can be done using an *energy* based approach which will work on clean audio, or using *voice activity detection (VAD)* which also works on audio with background or static noise (eg. from a microphone).\n", + "3. Scale the audio so that files have approximately the same volume level.\n", + "4. 
Filter out audio files which are too long or too short.\n", + "\n" + ], + "metadata": { + "id": "4WfEaMwpUsFt" + } + }, + { + "cell_type": "code", + "source": [ + "import IPython.display as ipd" + ], + "metadata": { + "id": "WEvIefjnd7AG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "audio_preprocessing_script = NEMO_SCRIPT_DIR / \"preprocess_audio.py\"\n", + "\n", + "# Directory with raw audio data\n", + "input_audio_dir = DATA_DIR / \"audio\"\n", + "# Directory to write preprocessed audio to\n", + "output_audio_dir = DATA_DIR / \"audio_44khz\"\n", + "# Whether to overwrite existing audio, if it exists in the output directory\n", + "overwrite_audio = True\n", + "# Whether to overwrite output manifest, if it exists\n", + "overwrite_manifest = True\n", + "# Number of threads to parallelize audio processing across\n", + "num_workers = 4\n", + "# Downsample data from 48khz to 44.1khz for compatibility\n", + "output_sample_rate = 44100\n", + "# Method for silence trimming. 
Can use \"energy.yaml\" or \"vad.yaml\".\n", + "# We use VAD for VCTK because the audio has background noise.\n", + "trim_config_path = NEMO_CONFIG_DIR / \"trim\" / \"vad.yaml\"\n", + "# Volume level (0, 1] to normalize audio to\n", + "volume_level = 0.95\n", + "# Filter out audio shorter than min_duration or longer than max_duration seconds.\n", + "# We set these bounds relatively low/high, as we can place stricter limits at training time\n", + "min_duration = 0.25\n", + "max_duration = 30.0\n", + "# Output file with entries that are filtered out based on duration\n", + "filter_file = DATA_DIR / \"filtered.json\"\n", + "\n", + "def preprocess_audio(data_type):\n", + " input_filepath = DATA_DIR / f\"{data_type}_text.json\"\n", + " output_filepath = DATA_DIR / f\"{data_type}_manifest.json\"\n", + "\n", + " args = [\n", + " f\"--input_manifest={input_filepath}\",\n", + " f\"--output_manifest={output_filepath}\",\n", + " f\"--input_audio_dir={input_audio_dir}\",\n", + " f\"--output_audio_dir={output_audio_dir}\",\n", + " f\"--num_workers={num_workers}\",\n", + " f\"--output_sample_rate={output_sample_rate}\",\n", + " f\"--trim_config_path={trim_config_path}\",\n", + " f\"--volume_level={volume_level}\",\n", + " f\"--min_duration={min_duration}\",\n", + " f\"--max_duration={max_duration}\",\n", + " f\"--filter_file={filter_file}\",\n", + " ]\n", + " if overwrite_manifest:\n", + " args.append(\"--overwrite_manifest\")\n", + " if overwrite_audio:\n", + " args.append(\"--overwrite_audio\")\n", + "\n", + " run_script(audio_preprocessing_script, args)" + ], + "metadata": { + "id": "0kQ1UDnGfdX6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "preprocess_audio(\"dev\")" + ], + "metadata": { + "id": "ai0zbXSOriuY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "preprocess_audio(\"train\")" + ], + "metadata": { + "id": "NUKnidQYfgDo" + }, + "execution_count": null, + "outputs": [] 
+ }, + { + "cell_type": "markdown", + "source": [ + "We should listen to a few audio files before and after the processing to be sure we configured it correctly.\n", + "\n", + "Note that the processed audio is louder. It is also shorter because we trimmed the leading and trailing silence." + ], + "metadata": { + "id": "x2yhJtsj2lDR" + } + }, + { + "cell_type": "code", + "source": [ + "audio_file = \"p228_009.wav\"\n", + "audio_filepath = input_audio_dir / audio_file\n", + "processed_audio_filepath = output_audio_dir / audio_file\n", + "\n", + "print(\"Original audio.\")\n", + "ipd.display(ipd.Audio(audio_filepath))\n", + "\n", + "print(\"Processed audio.\")\n", + "ipd.display(ipd.Audio(processed_audio_filepath))" + ], + "metadata": { + "id": "_fM3GwJxkjOA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Speaker Mapping" + ], + "metadata": { + "id": "d129p0nrr3PD" + } + }, + { + "cell_type": "markdown", + "source": [ + "We can use [create_speaker_map.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/tts/create_speaker_map.py) to easily create a mapping from speaker ID strings to integer indices that will be used at training time.\n", + "\n", + "The script will simply sort the speaker IDs and assign them numbers `[0, num_speakers)` in alphabetical order." 
+ ], + "metadata": { + "id": "ZJ1MWX3F3X9u" + } + }, + { + "cell_type": "code", + "source": [ + "speaker_map_script = NEMO_SCRIPT_DIR / \"create_speaker_map.py\"\n", + "\n", + "train_manifest_filepath = DATA_DIR / \"train_manifest.json\"\n", + "dev_manifest_filepath = DATA_DIR / \"dev_manifest.json\"\n", + "speaker_filepath = DATA_DIR / \"speakers.json\"\n", + "\n", + "args = [\n", + " f\"--manifest_path={train_manifest_filepath}\",\n", + " f\"--manifest_path={dev_manifest_filepath}\",\n", + " f\"--speaker_map_path={speaker_filepath}\"\n", + "]\n", + "\n", + "run_script(speaker_map_script, args)" + ], + "metadata": { + "id": "b5gdccYhr5Gk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Visualize the speaker map file.\n", + "!head $speaker_filepath" + ], + "metadata": { + "id": "CMcC2Nqmt5AR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Feature Computation" + ], + "metadata": { + "id": "jyFxOjy6t8vo" + } + }, + { + "cell_type": "markdown", + "source": [ + "Before training FastPitch, we need to compute some features for every audio file. The default [config file](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/feature/feature_44100.yaml) we will use has parameters for computing the **pitch** and **energy** of every audio frame. By default it will also compute a **voiced_mask** indicating which audio frames have no pitch (eg. because they contain silence)." 
+ ], + "metadata": { + "id": "QNPpwkM49orB" + } + }, + { + "cell_type": "code", + "source": [ + "feature_script = NEMO_SCRIPT_DIR / \"compute_features.py\"\n", + "\n", + "feature_config_path = NEMO_CONFIG_DIR / \"feature\" / \"feature_44100.yaml\"\n", + "audio_dir = DATA_DIR / \"audio_44khz\"\n", + "feature_dir = DATA_DIR / \"features_44khz\"\n", + "num_workers = 4\n", + "\n", + "def compute_features(data_type):\n", + " input_filepath = DATA_DIR / f\"{data_type}_manifest.json\"\n", + "\n", + " args = [\n", + " f\"--feature_config_path={feature_config_path}\",\n", + " f\"--manifest_path={input_filepath}\",\n", + " f\"--audio_dir={audio_dir}\",\n", + " f\"--feature_dir={feature_dir}\",\n", + " f\"--num_workers={num_workers}\"\n", + " ]\n", + "\n", + " run_script(feature_script, args)" + ], + "metadata": { + "id": "AI4aLRFbt_NQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "compute_features(\"dev\")" + ], + "metadata": { + "id": "kQqPw3uRwEsO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "compute_features(\"train\")" + ], + "metadata": { + "id": "ct1fN_4pwCu9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The features are stored in the specified `feature_dir`." + ], + "metadata": { + "id": "db83_UcOCOIo" + } + }, + { + "cell_type": "code", + "source": [ + "!ls $feature_dir" + ], + "metadata": { + "id": "_8bHP4j56LWG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Feature Statistics" + ], + "metadata": { + "id": "QsuxK1P0x7hZ" + } + }, + { + "cell_type": "markdown", + "source": [ + "For training it is beneficial for us to *normalize* our features. The most standard approach is to apply *mean-variance normalization* so that each feature has a mean of 0 and variance of 1. 
To do this we need to compute the *dataset statistics* with the mean and variance of each feature.\n", + "\n", + "For TTS it also helps to:\n", + "* Normalize features using speaker-level statistics.\n", + "* Use the `voiced_mask` to set the feature values of non-voiced audio frames to 0.\n", + "\n", + "Using the [compute_feature_stats.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/tts/compute_feature_stats.py) script we will compute the mean and variance of each feature for each speaker. The input to the script is the same [config file](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/feature/feature_44100.yaml) we used to compute the features." + ], + "metadata": { + "id": "O8GiAnAMCNeh" + } + }, + { + "cell_type": "code", + "source": [ + "feature_stats_script = NEMO_SCRIPT_DIR / \"compute_feature_stats.py\"\n", + "\n", + "train_manifest_filepath = DATA_DIR / \"train_manifest.json\"\n", + "output_stats_path = DATA_DIR / \"feature_stats_44khz.json\"\n", + "\n", + "args = [\n", + " f\"--feature_config_path={feature_config_path}\",\n", + " f\"--manifest_path={train_manifest_filepath}\",\n", + " f\"--audio_dir={audio_dir}\",\n", + " f\"--feature_dir={feature_dir}\",\n", + " f\"--stats_path={output_stats_path}\"\n", + "]\n", + "\n", + "run_script(feature_stats_script, args)" + ], + "metadata": { + "id": "DC4c1L3CxH-h" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The output feature statistics file contains the mean and variance of the pitch and energy for the entire dataset (under the key `global`), and for each speaker in the dataset." 
+ ], + "metadata": { + "id": "zos96yaoFho1" + } + }, + { + "cell_type": "code", + "source": [ + "!head $output_stats_path" + ], + "metadata": { + "id": "fOz1cpIdFcG9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# HiFi-GAN Training" + ], + "metadata": { + "id": "oRO842MUyODC" + } + }, + { + "cell_type": "markdown", + "source": [ + "Our standard FastPitch model is a two-part recipe consisting of the **FastPitch** acoustic model which predicts a mel spectrogram from text, and **HiFi-GAN** vocoder which predicts audio from the mel spectrogram.\n", + "\n", + "We will train HiFi-GAN first so that we can use it to help evaluate the performance of FastPitch as it is being trained.\n", + "\n", + "HiFi-GAN training only requires a manifest with the `audio_filepath` field. All other fields in the manifest are for FastPitch training.\n", + "\n", + "Here we show how to train these models from scratch. You can also fine-tune them from pretrained checkpoints as mentioned in our [FastPitch fine-tuning tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tts/FastPitch_Finetuning.ipynb), but pretrained checkpoints compatible with these experimental recipes are not yet available on NGC.\n" + ], + "metadata": { + "id": "E4wUKYOfH8ax" + } + }, + { + "cell_type": "code", + "source": [ + "import torch" + ], + "metadata": { + "id": "pqfl9jAYMJob" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset_name = \"vctk\"\n", + "audio_dir = DATA_DIR / \"audio_44khz\"\n", + "train_manifest_filepath = DATA_DIR / \"train_manifest.json\"\n", + "dev_manifest_filepath = DATA_DIR / \"dev_manifest.json\"" + ], + "metadata": { + "id": "jK2rr-Kr6Qg8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "hifigan_training_script = NEMO_EXAMPLES_DIR / \"hifigan.py\"\n", + "\n", + "# The total number of training steps will be (epochs * 
steps_per_epoch)\n", + "epochs = 10\n", + "steps_per_epoch = 10\n", + "\n", + "sample_rate = 44100\n", + "\n", + "# Config files specifying all HiFi-GAN parameters\n", + "hifigan_config_dir = NEMO_CONFIG_DIR / \"hifigan\"\n", + "hifigan_config_filename = \"hifigan_data.yaml\"\n", + "feature_config = f\"feature_{sample_rate}\"\n", + "sample_config = f\"sample_{sample_rate}\"\n", + "\n", + "# Name of the experiment that will determine where it is saved locally and in TensorBoard and WandB\n", + "run_id = \"test_run\"\n", + "exp_dir = root_dir / \"exps\"\n", + "hifigan_exp_output_dir = exp_dir / \"HifiGan\" / run_id\n", + "# Directory where predicted audio will be stored periodically throughout training\n", + "hifigan_log_dir = hifigan_exp_output_dir / \"logs\"\n", + "\n", + "if sample_rate == 22050:\n", + " generator_config = \"v1\"\n", + "elif sample_rate == 44100:\n", + " generator_config = \"v1_44100\"\n", + "else:\n", + " raise ValueError(f\"Unsupported sampling rate {sample_rate}\")\n", + "\n", + "if torch.cuda.is_available():\n", + " accelerator=\"gpu\"\n", + " batch_size = 16\n", + "else:\n", + " accelerator=\"cpu\"\n", + " batch_size = 2\n", + "\n", + "args = [\n", + " f\"--config-path={hifigan_config_dir}\",\n", + " f\"--config-dir={NEMO_CONFIG_DIR}\",\n", + " f\"--config-name={hifigan_config_filename}\",\n", + " f\"feature={feature_config}\",\n", + " f\"sample={sample_config}\",\n", + " f'model/generator={generator_config}',\n", + " f\"max_epochs={epochs}\",\n", + " f\"weighted_sampling_steps_per_epoch={steps_per_epoch}\",\n", + " f\"batch_size={batch_size}\",\n", + " f\"log_dir={hifigan_log_dir}\",\n", + " f\"exp_manager.exp_dir={exp_dir}\",\n", + " f\"+exp_manager.version={run_id}\",\n", + " f\"trainer.accelerator={accelerator}\",\n", + " f\"+train_ds_meta.{dataset_name}.manifest_path={train_manifest_filepath}\",\n", + " f\"+train_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " 
f\"+val_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+val_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+log_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+log_ds_meta.{dataset_name}.audio_dir={audio_dir}\"\n", + "]" + ], + "metadata": { + "id": "Vr4D-NB-yQx8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# If an error occurs, log the entire stacktrace.\n", + "os.environ[\"HYDRA_FULL_ERROR\"] = \"1\"" + ], + "metadata": { + "id": "Bn8lQG0PxWGi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "run_script(hifigan_training_script, args)" + ], + "metadata": { + "id": "yUxFCNrE3Ywi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "During training, the model will automatically save predictions for all files specified in the `log_ds_meta` manifest." + ], + "metadata": { + "id": "BBPIpS-lL6z9" + } + }, + { + "cell_type": "code", + "source": [ + "hifigan_log_epoch_dir = hifigan_log_dir / \"epoch_10\"\n", + "!ls $hifigan_log_epoch_dir" + ], + "metadata": { + "id": "rSFOm1Sg46Lh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "This makes it easy to listen to the audio to determine how well the model is performing. 
We can decide to stop training when either:\n", + "\n", + "* The predicted audio sounds almost exactly the same as the original audio\n", + "* The predicted audio stops improving in between epochs.\n", + "\n", + "**Note that the dataset in this tutorial is too small to get good quality audio output.**" + ], + "metadata": { + "id": "oCJs7oCLMIjD" + } + }, + { + "cell_type": "code", + "source": [ + "audio_filepath = hifigan_log_epoch_dir / \"p225_143.wav\"\n", + "ipd.display(ipd.Audio(audio_filepath))" + ], + "metadata": { + "id": "G6k4ymzfJ5Y6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# FastPitch Training" + ], + "metadata": { + "id": "lV--2Wph7NPG" + } + }, + { + "cell_type": "markdown", + "source": [ + "Finally we can train the FastPitch model itself. The FastPitch training recipe requires:\n", + "\n", + "1. Training manifest(s) with `audio_filepath` and `text` or `normalized_text` fields.\n", + "2. Precomputed features such as *pitch* and *energy* specified in the feature [config file](https://github.com/NVIDIA/NeMo/blob/main/examples/tts/conf/feature/feature_44100.yaml).\n", + "3. (Optional) Statistics file for normalizing features.\n", + "4. (Optional) For a multi-speaker model, the manifest needs a `speaker` field and a JSON file mapping speaker IDs to speaker indices.\n", + "5. (Optional) To train with IPA phonemes, a [phoneme dictionary](https://github.com/NVIDIA/NeMo/blob/main/scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt) and optional [heteronyms file](https://github.com/NVIDIA/NeMo/blob/main/scripts/tts_dataset_files/heteronyms-052722)\n", + "6. 
(Optional) HiFi-GAN checkpoint or [NGC model name](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/models/hifigan.py#L413) for generating audio predictions during training.\n", + "\n" + ], + "metadata": { + "id": "aOuoPXDhOVD7" + } + }, + { + "cell_type": "code", + "source": [ + "fastpitch_training_script = NEMO_EXAMPLES_DIR / \"fastpitch.py\"\n", + "\n", + "# The total number of training steps will be (epochs * steps_per_epoch)\n", + "epochs = 10\n", + "steps_per_epoch = 10\n", + "\n", + "num_speakers = 5\n", + "sample_rate = 44100\n", + "\n", + "# Config files specifying all FastPitch parameters\n", + "fastpitch_config_dir = NEMO_CONFIG_DIR / \"fastpitch\"\n", + "fastpitch_config_filename = \"fastpitch.yaml\"\n", + "feature_config = f\"feature_{sample_rate}\"\n", + "\n", + "# Metadata files and directories\n", + "dataset_file_dir = NEMO_DIR / \"scripts\" / \"tts_dataset_files\"\n", + "phoneme_dict_path = dataset_file_dir / \"ipa_cmudict-0.7b_nv23.01.txt\"\n", + "heteronyms_path = dataset_file_dir / \"heteronyms-052722\"\n", + "\n", + "speaker_path = DATA_DIR / \"speakers.json\"\n", + "feature_dir = DATA_DIR / \"features_44khz\"\n", + "stats_path = DATA_DIR / \"feature_stats_44khz.json\"\n", + "\n", + "def get_latest_checkpoint(checkpoint_dir):\n", + " output_path = None\n", + " for checkpoint_path in checkpoint_dir.iterdir():\n", + " checkpoint_name = str(checkpoint_path.name)\n", + " if checkpoint_name.endswith(\".nemo\"):\n", + " output_path = checkpoint_path\n", + " break\n", + " if checkpoint_name.endswith(\"last.ckpt\"):\n", + " output_path = checkpoint_path\n", + "\n", + " if not output_path:\n", + " raise ValueError(f\"Could not find latest checkpoint in {checkpoint_dir}\")\n", + "\n", + " return output_path\n", + "\n", + "# HiFi-GAN model for generating audio predictions from FastPitch output\n", + "vocoder_type = \"hifigan\"\n", + "vocoder_checkpoint_path = get_latest_checkpoint(hifigan_exp_output_dir / \"checkpoints\")\n", + "\n", + 
"run_id = \"test_run\"\n", + "exp_dir = root_dir / \"exps\"\n", + "fastpitch_exp_output_dir = exp_dir / \"FastPitch\" / run_id\n", + "fastpitch_log_dir = fastpitch_exp_output_dir / \"logs\"\n", + "\n", + "if torch.cuda.is_available():\n", + " accelerator=\"gpu\"\n", + " batch_size = 32\n", + "else:\n", + " accelerator=\"cpu\"\n", + " batch_size = 4\n", + "\n", + "args = [\n", + " f\"--config-path={fastpitch_config_dir}\",\n", + " f\"--config-dir={NEMO_CONFIG_DIR}\",\n", + " f\"--config-name={fastpitch_config_filename}\",\n", + " f\"feature={feature_config}\",\n", + " f\"n_speakers={num_speakers}\",\n", + " f\"speaker_path={speaker_path}\",\n", + " f\"max_epochs={epochs}\",\n", + " f\"weighted_sampling_steps_per_epoch={steps_per_epoch}\",\n", + " f\"phoneme_dict_path={phoneme_dict_path}\",\n", + " f\"heteronyms_path={heteronyms_path}\",\n", + " f\"feature_stats_path={stats_path}\",\n", + " f\"log_dir={fastpitch_log_dir}\",\n", + " f\"vocoder_type={vocoder_type}\",\n", + " f\"vocoder_checkpoint_path=\\\\'{vocoder_checkpoint_path}\\\\'\",\n", + " f\"trainer.accelerator={accelerator}\",\n", + " f\"exp_manager.exp_dir={exp_dir}\",\n", + " f\"+exp_manager.version={run_id}\",\n", + " f\"+train_ds_meta.{dataset_name}.manifest_path={train_manifest_filepath}\",\n", + " f\"+train_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+train_ds_meta.{dataset_name}.feature_dir={feature_dir}\",\n", + " f\"+val_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+val_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+val_ds_meta.{dataset_name}.feature_dir={feature_dir}\",\n", + " f\"+log_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+log_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+log_ds_meta.{dataset_name}.feature_dir={feature_dir}\"\n", + "]" + ], + "metadata": { + "id": "8MdMXnOAIFvj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + 
"run_script(fastpitch_training_script, args)" + ], + "metadata": { + "id": "apl7TvW0TaEG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "During training, the model will automatically save spectrogram and audio predictions for all files specified in the `log_ds_meta` manifest." + ], + "metadata": { + "id": "Z01Fq7WRl7Di" + } + }, + { + "cell_type": "code", + "source": [ + "faspitch_log_epoch_dir = fastpitch_log_dir / \"epoch_10\"\n", + "!ls $faspitch_log_epoch_dir" + ], + "metadata": { + "id": "E8rVKnKN5HDa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "This makes it easy to listen to the audio to determine how well the model is performing. We can decide to stop training when either:\n", + "\n", + "* The predicted audio stops improving in between epochs.\n", + "* The predicted spectrogram stops changing in between epochs.\n", + "\n", + "**Note that the dataset in this tutorial is too small to get good quality audio output.**" + ], + "metadata": { + "id": "PeNaxoCzN7Ii" + } + }, + { + "cell_type": "code", + "source": [ + "audio_filepath = faspitch_log_epoch_dir / \"p225_143.wav\"\n", + "spectrogram_filepath = faspitch_log_epoch_dir / \"p225_143_spec.png\"\n", + "\n", + "ipd.display(ipd.Audio(audio_filepath))\n", + "ipd.display(ipd.Image(spectrogram_filepath))" + ], + "metadata": { + "id": "ynZdcnKc3CRF" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 36adc7e0943a9654c21a89a8b5b8b33ed6c2efcc Mon Sep 17 00:00:00 2001 From: Sandeep Subramanian Date: Wed, 12 Jul 2023 23:21:43 -0700 Subject: [PATCH 102/123] Fix missing import (#7026) Signed-off-by: MaximumEntropy --- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py 
index 946df3da2aa5..c80d2272613e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -13,6 +13,7 @@ # limitations under the License. import json +from functools import partial from typing import Any, Optional import torch From 5e02346506851b086abcf68976e9bbf91a6a8514 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Jul 2023 20:05:11 +0400 Subject: [PATCH 103/123] fix install_beamsearch_decoders (#7011) * aliases Signed-off-by: Nikolay Karpov * add NEMO_PATH Signed-off-by: Nikolay Karpov * expand_aliases Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov --- .../ngram_lm/install_beamsearch_decoders.sh | 13 ++++++++---- tutorials/asr/Offline_ASR.ipynb | 20 ++++++++++--------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh index 0760dd02319d..558a84698f49 100755 --- a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh +++ b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh @@ -14,6 +14,8 @@ # limitations under the License. 
# Use this script to install KenLM, OpenSeq2Seq decoder, Flashlight decoder +shopt -s expand_aliases + NEMO_PATH=/workspace/nemo # Path to NeMo folder: /workspace/nemo if you use NeMo/Dockerfile if [ "$#" -eq 1 ] then @@ -24,11 +26,14 @@ KENLM_MAX_ORDER=10 # Maximum order of KenLM model, also specified in the setup_o cd $NEMO_PATH if [ $(id -u) -eq 0 ]; then - alias sudo=eval + alias aptupdate='apt-get update' + alias b2install='./b2' +else + alias aptupdate='sudo apt-get update' + alias b2install='sudo ./b2' fi -sudo apt-get update && apt-get upgrade -y && apt-get install -y liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for flashlight decoder - +aptupdate && apt-get upgrade -y && apt-get install -y liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for flashlight decoder' git clone https://github.com/NVIDIA/OpenSeq2Seq cd OpenSeq2Seq @@ -42,7 +47,7 @@ cp $NEMO_PATH/scripts/installers/setup_os2s_decoders.py ./setup.py ./setup.sh # install Boost package for KenLM -wget https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.bz2 --no-check-certificate && tar --bzip2 -xf $NEMO_PATH/decoders/boost_1_80_0.tar.bz2 && cd boost_1_80_0 && ./bootstrap.sh && sudo ./b2 --layout=tagged link=static,shared threading=multi,single install -j4 || echo FAILURE +wget https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.bz2 --no-check-certificate && tar --bzip2 -xf $NEMO_PATH/decoders/boost_1_80_0.tar.bz2 && cd boost_1_80_0 && ./bootstrap.sh && b2install --layout=tagged link=static,shared threading=multi,single install -j4 || echo FAILURE export BOOST_ROOT=$NEMO_PATH/decoders/boost_1_80_0 # install KenLM diff --git a/tutorials/asr/Offline_ASR.ipynb b/tutorials/asr/Offline_ASR.ipynb index fc8af2e76416..82fb5401a3fe 100644 --- a/tutorials/asr/Offline_ASR.ipynb +++ b/tutorials/asr/Offline_ASR.ipynb @@ -3,9 +3,7 @@ "nbformat_minor": 0, "metadata": { "colab": { - "name": "Offline_ASR.ipynb", "provenance": [], - 
"collapsed_sections": [], "toc_visible": true }, "kernelspec": { @@ -31,7 +29,9 @@ "\n", "You may find more info on how to train and use language models for ASR models here:\n", "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n" + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n" ] }, { @@ -41,7 +41,7 @@ }, "source": [ "## Installation\n", - "NeMo can be installed via simple pip command. \n", + "NeMo can be installed via simple pip command.\n", "\n", "Optional CTC beam search decoder might require restart of Colab runtime after installation." ] @@ -77,12 +77,14 @@ " import ctc_decoders\n", "except ModuleNotFoundError:\n", " # install beam search decoder\n", + " import os\n", " !apt-get update && apt-get install -y swig\n", " !git clone https://github.com/NVIDIA/NeMo -b \"$BRANCH\"\n", - " !cd NeMo && bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh\n", + " pwd = !pwd\n", + " NEMO_PATH = os.path.join(pwd[0], \"NeMo\")\n", + " !cd NeMo && bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh $NEMO_PATH\n", " print('Restarting Colab runtime to successfully import built module.')\n", " print('Please re-run the notebook.')\n", - " import os\n", " os.kill(os.getpid(), 9)" ], "execution_count": null, @@ -110,7 +112,7 @@ }, "source": [ "## Instantiate pre-trained NeMo model\n", - "``from_pretrained(...)`` API downloads and initializes model directly from the cloud. 
\n", + "``from_pretrained(...)`` API downloads and initializes model directly from the cloud.\n", "\n", "Alternatively, ``restore_from(...)`` allows loading a model from a disk.\n", "\n", @@ -403,7 +405,7 @@ "source": [ "## Offline inference with beam search decoder and N-gram language model re-scoring\n", "\n", - "It is possible to use an external [KenLM](https://kheafield.com/code/kenlm/)-based N-gram language model to rescore multiple transcription candidates. \n", + "It is possible to use an external [KenLM](https://kheafield.com/code/kenlm/)-based N-gram language model to rescore multiple transcription candidates.\n", "\n", "Let's download and preprocess LibriSpeech 3-gram language model." ] @@ -653,4 +655,4 @@ "outputs": [] } ] -} +} \ No newline at end of file From caddb8dc4cdd5ed991697566ecca7f44b8751693 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Thu, 13 Jul 2023 10:42:14 -0700 Subject: [PATCH 104/123] Update SDP docs page with a new documentation link (#7029) Signed-off-by: Igor Gitman --- docs/source/tools/speech_data_processor.rst | 162 +------------------- 1 file changed, 3 insertions(+), 159 deletions(-) diff --git a/docs/source/tools/speech_data_processor.rst b/docs/source/tools/speech_data_processor.rst index 29bc4abb82bd..262b214c6355 100644 --- a/docs/source/tools/speech_data_processor.rst +++ b/docs/source/tools/speech_data_processor.rst @@ -1,166 +1,10 @@ Speech Data Processor -======================== +===================== Speech Data Processor (SDP) is a toolkit to make it easy to: 1. write code to process a new dataset, minimizing the amount of boilerplate code required. 2. share the steps for processing a speech dataset. -SDP is hosted here: https://github.com/NVIDIA/NeMo-speech-data-processor. +SDP is hosted here: https://github.com/NVIDIA/NeMo-speech-data-processor. 
-SDP's philosophy is to represent processing operations as 'processor' classes, which take in a path to a NeMo-style data manifest as input (or a path to the raw data directory if you do not have a NeMo-style manifest to start with), apply some processing to it, and then save the output manifest file. - -You specifiy which processors you want to run using a YAML config file. Many common processing operations are provided, and it is easy to add your own. If you do not need to add your own processors, then all that is needed to process a new dataset is to write a single YAML file containing the parameters needed to process your dataset. - -.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.17.0/sdp_overview_diagram.png - :alt: Overview diagram of Speech Data Processor - -Overview of how SDP processes a dataset ---------------------------------------- - -1. You call the ``main.py`` script, passing in a YAML config file, possibly with some overrides. -2. ``main.py`` script calls ``run_processors.py``, passing in your config. -3. ``run_processors.py`` does the following: - - a. picks out the processors that you specified to be run (you can specify a subset of the processors in the config override, e.g. to avoid re-running time-consuming steps). - b. if some of the processors have not had "output_manifest_file" or "input_manfiest_file" entries specified, SDP will automatically create temporary files for those. - c. instantiates the processor classes using ``hydra.utils.instantiate`` - d. runs the run-time processor tests by calling the ``processor.test()`` method (more details about testing :ref:`here`). - e. runs the processing method (``processor.process()``) of each processor in order. - - -Layout of config YAML files -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The YAML config file for processing a dataset must contain a key ``processors``, the value of which is a list. Each item in that list is expected to be a dictionary specifying a processor class, i.e. 
it must have a key ``_target_``, the value of which is a path to a "processor" class, and the remaining keys must be the kwargs necessary to instantiate that class with ``hydra.utils.instantiate()`` (c.f. https://hydra.cc/docs/advanced/instantiate_objects/overview/). - -SDP will run the processors specified in the ``processors`` list in the config file. It will also check for a ``processors_to_run`` key in the config file, which can be either the string ``"all"``, or any Python "slice" object like ``3:4``, ``2:`` etc. (if there is no ``processors_to_run`` key, then all of the processors will be run). - -.. note:: - SDP will run the processors in the order in which they are listed in the config YAML file. Make sure to list the processors in an order which makes sense, e.g. create an initial manifest first; make sure to run asr inference before doing any processing which looks at ``pred_text`` fields in the manifest. - -Processor classes ------------------ - -**BaseProcessor** -~~~~~~~~~~~~~~~~~ - -All processor classes inherit from the ``BaseProcessor`` class. This is a simple abstract class which has 2 empty methods: ``process()`` and ``test()``. -These serve to remind us that SDP essentially just runs ``test()`` on all processors, and then ``process()`` on all processors (more details about testing :ref:`here`). - -``ASRInference`` is a child class of ``BaseProcessor``. It has a simple ``process()`` method which runs transcription on every utterance in the input_manifest. - -``WriteManifest`` is also a child class of ``BaseProcessor``. It has a simple ``process()`` method which saves a copy of the input manifest containing only the fields specified in ``fields_to_save``. - -**BaseParallelProcessor** -~~~~~~~~~~~~~~~~~~~~~~~~~ -``BaseParallelProcessor`` inherits from the ``BaseProcessor`` class. Within the ``BaseParallelProcessor.process()`` method, it calls other methods and functions, which allow it to do more complex processing. 
-Most importantly, it calls its ``BaseParallelProcessor.process_dataset_entry(data_entry)`` method on every utterance in the manifest, and it does this in parallel, allowing for more efficient processing. - -What is a **DataEntry**? -~~~~~~~~~~~~~~~~~~~~~~~~ -As mentioned above, ``BaseParallelProcessor.process_dataset_entry(data_entry)`` is called on a variable called ``data_entry`` which represents an utterance in our dataset. -Most often, ``data_entry`` will be a dictionary containing items which represent the JSON manifest entry. -Sometimes, such as in ``CreateInitialManifestMLS``, it will be a string containing a line for that utterance from the original raw MLS transcript. - -``BaseParallelProcessor.process_dataset_entry`` will process ``data_entry`` and output a ``DataEntry`` object. - -The ``DataEntry`` class is a dataclass which contains 2 attributes: - -1. ``data`` is an Optional dictionary containing items which represent the JSON manifest entry. ``data`` can also be ``None``. If a ``.process_dataset_entry(data_entry)`` method returns a ``DataEntry`` class where ``data is None``, then that utterance will be dropped from the output manifest. -2. ``metrics``, which can be of any type, and are ``None`` by default. This variable is used by some variables to record summary statistics about the changes made to the dataset, these metrics are aggregated and can be displayed once every utterance has been processed by the processor. - -What happens in **BaseParallelProcessor.process()**? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We outline the ``BaseParallelProcessor.process()`` method below: - -.. raw:: html - -
- -
- - -**ModifyManifestTextProcessor** -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -``ModifyManifestTextProcessor`` inherits from the ``BaseParallelProcessor`` class. - -The ``ModifyManifestTextProcessor`` constructor takes in the following arguments: -* ``text_key`` (string) and ``pred_text_key`` (string): these parameters specify which keys in ``data_entry.data`` will be used for processing. (default: ``text_key="text"``, ``pred_text_key="pred_text"``, ie. by default the processor will refer to and modify the ``"text"`` and/or ``"pred_text"`` attributes of the input manifest). -* ``test_cases`` (optional, list of dicts) - test cases for checking that the processor makes the changes that we are expecting. - -``ModifyManifestTextProcessor`` has the following methods: -* ``ModifyManifestTextProcessor.test()``: this method makes sure that the output from the processor matches the expected output specified in the ``test_cases`` parameter. -* ``ModifyManifestTextProcessor.process_dataset_entry(data_entry)``: this method applies processing to a ``data_entry``. First, spaces are added to the start and end of the 'text' and 'pred_text' entries (if they exist), then the abstract method ``ModifyManifestTextProcessor._process_dataset_entry(data_entry)`` is called. Then, any extra spaces (e.g. two spaces next to each other ' ') are removed from 'text' and 'pred_text' entries. -* ``ModifyManifestTextProcessor._process_dataset_entry(data_entry)``: this is an abstract method which will be over-written by children of ``ModifyManifestTextProcessor``. - -How to make your own processor classes --------------------------------------- - -We will describe how to make your own processor classes by referring to SDP's existing classes. - -Creating an initial manifest -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -One of the child classes of ``BaseParallelProcessor`` provided in SDP is ``CreateInitialManifestMLS``. 
It downloads raw MLS data for a specified language, and creates an initial manifest (in the format expected by NeMo) which can be cleaned by subsequent processors. - -The ``CreateInitialManifestMLS.prepare()`` method downloads and extracts the raw data. - -The ``CreateInitialManifestMLS.read_manifest()`` method reads the lines in the raw MLS transcript file. - -The ``CreateInitialManifestMLS.process_dataset_entry()`` method takes in the lines from the raw MLS transcript file, and outputs ``DataEntry`` objects containing entries that will be saved into the manifest (i.e. ``"audio_filepath"``, ``"duration"``, ``"text"``) for each utterance. - - -A **ModifyManifestTextProcessor** subclass that cleans the reference text -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One of the classes provided in SDP is ``SubRegex``. At initialization, it takes in ``regex_params_list``, a list of dictionaries which must contain the keys ``"pattern"``, ``"repl"``, and, optionally, ``"count"``. These keys will be used to apply regex substitutions using these parameters fed into ``re.sub``. The substitutions will be applied to the data at ``text_key`` (i.e. ``data_entry.data[self.text_key]``). By default, ``text_key="text"``, i.e. the substitutions will be applied to the ``"text"`` attribute of the manifest. - -In its ``_process_dataset_entry(data_entry)`` method, the ``SubRegex`` processor does the string to string conversion upon the ``data_entry`` that is input. Its output is a ``data_entry`` with the changes applied to ``data``, and the the metrics of which regex patterns caused a substitution to be made. These metrics will be aggregated over all utterances by the ``BaseParallelProcessor`` class. ``SubRegex`` also has a ``finalize(metrics)`` method which will log information about the aggregated metrics after all of the utterances in the manifest have been processed. 
- -A **ModifyManifestTextProcessor** subclass that drops incorrectly transcribed utterances -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One of the classes provided in SDP is ``DropHighLowCharrate``. At initialization, it takes in ``high_charrate_threshold`` and ``low_charrate_threshold``, for which the utterance will be dropped if it is above or below each value respectively. This is helpful for automatically filtering out incorrectly transcribed utterances. - -In its ``_process_dataset_entry(data_entry)`` method it evaluates the character rate of the utterance(by dividing the length of ``data_entry.data[self.text_key]`` by the value of ``data_entry.data["duration"]``). If the character rate is within bounds, it will return the same ``data_entry`` that was input. If the character rate is out of bounds, it will return a ``data_entry`` with ``data=None`` and ``metrics`` which reflect the applied changes. -Similar to the ``SubSubstringToSpace`` class, it has a ``finalize(metrics)`` method which will log information about the aggregated metrics after all of the utterances in the manifest have been processed. - -Class diagram -------------- -A diagram of the classes mentioned above is included here. Arrows represent inheritance. - -We omit the details of the ``CreateInitialManifestMLS`` class in the diagram in order to save space. - - -.. raw:: html - -
- -
- -SDP Tests ---------- -It is important to make sure that your data processing code has the effect you intend, so SDP has a few different types of tests: - -1. Runtime tests - -* Before running the specified processors, SDP runs ``processor.test()`` on all specified processors. -* Currently, the only provided processor classes with a test method are subclasses of ``ModifyManifestTextProcessor``. - - * ``ModifyManifestTextProcessor.test()`` runs any ``test_cases`` that were provided in the object constructor. - * This means you can provided test cases in the YAML config file, and the dataset will only be processed if the test cases pass. - * This is helpful to (a) make sure that the rules you wrote have the effect you desired, and (b) demonstrate why you wrote those rules. - * An example of test cases we could include in the YAML config file:: - - - _target_: sdp.processors.DropIfRegexMatch - regex_patterns: - - "(\\D ){5,20}" # looks for between 4 and 19 characters surrounded by spaces - test_cases: - - {input: {text: "some s p a c e d out letters"}, output: null} - - {input: {text: "normal words only"}, output: {text: "normal words only"}} - -2. ``pytest`` tests which can be run locally with ``python -m pytest tests/`` and will be run during the GitHub CI process. There are 2 sub-types: - - a. "End to end" tests (link) which run SDP on a mini version of the raw initial dataset, and make sure the final manifest matches the reference final manifest. - b. "Unit tests" for processors and utils (link). +To learn more about SDP, please check the `documentation <https://nvidia.github.io/NeMo-speech-data-processor/>`_. 
From 7ccc2cf2c81f48bf3dd352180498a483292bcc7e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 13 Jul 2023 22:32:39 +0400 Subject: [PATCH 105/123] [TTS] Append pretrained FastPitch & SpectrogamEnhancer pair to available models (#7012) (#7013) * [TTS] fastpitch: add english libritts model with asr stft parameters (25 ms 10 ms) * [TTS] enhancer: add pretrained model intended for asr finetuning --------- Signed-off-by: Roman Korostik --- nemo/collections/tts/models/fastpitch.py | 14 +++++++++++++ .../tts/models/spectrogram_enhancer.py | 20 +++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index dc598a9a76d1..8f0e06ea304d 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -772,6 +772,20 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': ) list_of_models.append(model) + # en, multi speaker, LibriTTS, 16000 Hz + # stft 25ms 10ms matching ASR params + # for use during English ASR training/adaptation + model = PretrainedModelInfo( + pretrained_model_name="tts_en_fastpitch_for_asr_finetuning", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning/versions/1.20.0/files/tts_en_fastpitch_for_asr_finetuning.nemo", + description="This model is trained on LibriSpeech, train-960 subset." + " STFT parameters follow those commonly used in ASR: 25 ms window, 10 ms hop." + " This model is supposed to be used with its companion SpetrogramEnhancer for " + " ASR fine-tuning. 
Usage for regular TTS tasks is not advised.", + class_=cls, + ) + list_of_models.append(model) + return list_of_models # Methods for model exportability diff --git a/nemo/collections/tts/models/spectrogram_enhancer.py b/nemo/collections/tts/models/spectrogram_enhancer.py index bcc7e69a10bf..ca2fe6122230 100644 --- a/nemo/collections/tts/models/spectrogram_enhancer.py +++ b/nemo/collections/tts/models/spectrogram_enhancer.py @@ -56,7 +56,7 @@ HingeLoss, ) from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor, to_device_recursive -from nemo.core import Exportable, ModelPT, typecheck +from nemo.core import Exportable, ModelPT, PretrainedModelInfo, typecheck from nemo.core.neural_types import LengthsType, MelSpectrogramType, NeuralType from nemo.core.neural_types.elements import BoolType from nemo.utils import logging @@ -277,7 +277,23 @@ def setup_validation_data(self, val_data_config): @classmethod def list_available_models(cls): - return [] + list_of_models = [] + + # en, multi speaker, LibriTTS, 16000 Hz + # stft 25ms 10ms matching ASR params + # for use during English ASR training/adaptation + model = PretrainedModelInfo( + pretrained_model_name="tts_en_spectrogram_enhancer_for_asr_finetuning", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning/versions/1.20.0/files/tts_en_spectrogram_enhancer_for_asr_finetuning.nemo", + description="This model is trained to add details to synthetic spectrograms." + " It was trained on pairs of real-synthesized spectrograms generated by FastPitch." + " STFT parameters follow ASR with 25 ms window and 10 ms hop." 
+ " It is supposed to be used in conjunction with that model for ASR training/adaptation.", + class_=cls, + ) + list_of_models.append(model) + + return list_of_models def log_illustration(self, target_spectrograms, input_spectrograms, enhanced_spectrograms, lengths): if self.global_rank != 0: From d44127eebc975647d47009948ef65302b802fb32 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 13 Jul 2023 22:34:01 +0400 Subject: [PATCH 106/123] Add ASR with TTS Tutorial. Fix enhancer usage. (#6955) (#7023) * Add ASR with TTS Tutorial * Fix enhancer usage Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev --- docs/source/starthere/tutorials.rst | 4 +- .../asr/models/hybrid_asr_tts_models.py | 2 + tutorials/asr/ASR_TTS_Tutorial.ipynb | 846 ++++++++++++++++++ 3 files changed, 850 insertions(+), 2 deletions(-) create mode 100644 tutorials/asr/ASR_TTS_Tutorial.ipynb diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index e24637718690..2f4ea59cff5c 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -107,8 +107,8 @@ To run a tutorial: - Multi-lingual ASR - `Multi-lingual ASR `_ * - ASR - - Confidence-based Ensembles - - `Confidence-based Ensembles `_ + - Hybrid ASR-TTS Models Tutorial + - `Multi-lingual ASR `_ * - NLP - Using Pretrained Language Models for Downstream Tasks - `Pretrained Language Models for Downstream Tasks `_ diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 8486f956c3b7..8494a093b29d 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -311,8 +311,10 @@ def from_pretrained_models( ) ) else: + cfg = copy.deepcopy(cfg) # copy to avoid modifying original config cfg.tts_model_path = f"{tts_model_path}" cfg.asr_model_path = f"{asr_model_path}" + 
cfg.enhancer_model_path = f"{enhancer_model_path}" if enhancer_model_path is not None else None return ASRWithTTSModel(cfg, trainer=trainer) def __setattr__(self, name, value): diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb new file mode 100644 index 000000000000..939ef8a28d29 --- /dev/null +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -0,0 +1,846 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a3570803-9bfa-4e97-9891-5ae0759eb8ca", + "metadata": {}, + "source": [ + "# Hybrid ASR-TTS Models Tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "50fc294f-f319-4465-8f90-a28b49843e60", + "metadata": {}, + "source": [ + "This tutorial is intended to introduce you to using ASR-TTS Hybrid Models, also known as `ASRWithTTSModel`, to finetune existing ASR models using an integrated text-to-mel-spectrogram generator. " + ] + }, + { + "cell_type": "markdown", + "id": "d2a01ca5-bd48-4d82-a97d-5b07a7b27ca0", + "metadata": {}, + "source": [ + "## ASR-TTS Models: Description" + ] + }, + { + "cell_type": "markdown", + "id": "b32467a9-c458-4590-aff7-e8d1e91b0870", + "metadata": {}, + "source": [ + "### Problem\n", + "\n", + "Adapting ASR models to a new text domain is a challenging task. Modern end-to-end systems can require several hundreds and thousands of hours to perform recognition with high accuracy. Acquiring audio-text paired data for a specific domain can be prohibitively expensive. Text-only data, on the other side, is widely available. \n", + "\n", + "One of the approaches for efficient adaptation is synthesizing audio data from text and using such data for training the ASR model conventionally. We modify this approach, incorporating TTS and ASR systems into a single model. 
We use only a lightweight multi-speaker text-to-mel-spectrogram generator (without vocoder) with an optional enhancer that mitigates the mismatch between natural and synthetic spectrograms.\n", + "\n", + "### Architecture\n", + "\n", + "\"ASR-TTS\n", + "\n", + "`ASRWithTTSModel` is a transparent wrapper for three models:\n", + "- ASR model (`EncDecCTCModelBPE`, `EncDecRNNTBPEModel` or `EncDecHybridRNNTCTCBPEModel` are supported)\n", + "- frozen text-to-mel-spectrogram model (currently, only `FastPitch` model is supported)\n", + "- optional frozen enhancer model\n", + "\n", + "The architecture is shown in the figure. \n", + "\n", + "The model can take text or audio as input during training. In the case of audio input, a mel spectrogram is extracted as usual and passed to the ASR neural network. In the case of textual input, the mel spectrogram generator produces a spectrogram on the fly from the text. The spectrogram is improved by the enhancer (if present) and fed into the ASR model. \n", + "\n", + "### Capabilities and Limitations\n", + "\n", + "This approach can be used to finetune the pretrained ASR model using text-only data. Training new models from scratch is also possible. The text should contain phrases and sentences and be split into sentences (~45 words maximum, corresponding to ~16.7 seconds of synthesized audio). Using only separate words is not recommended since this doesn't allow the ASR model to adapt to recognizing new words in context. \n", + "\n", + "Mixing audio-text pairs with text-only data from the original domain is recommended to preserve performance on the original data.
\n", + "Also, fusing BatchNorm (see parameters below) is recommended for the best performance when using a large proportion of text compared to the amount of audio-text pairs in finetuning process.\n", + "\n", + "\n", + "### Implementation Details and Experiments\n", + "\n", + "Further details about implementation and experiments can be found in the paper [Text-only domain adaptation for end-to-end ASR using integrated text-to-mel-spectrogram generator](https://arxiv.org/abs/2302.14036)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2702d081-c675-4a96-8263-6059e310d048", + "metadata": {}, + "source": [ + "## Example: Finetuning ASR Model Using Text-Only Data" + ] + }, + { + "cell_type": "markdown", + "id": "30fe41a3-f36c-4803-a7f0-4260fb111478", + "metadata": {}, + "source": [ + "In this example, we will finetune a pretrained small Conformer-CTC model using text-only data from the AN4 dataset. [AN4 dataset](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/datasets.html#an4-dataset) is a small dataset that consists of sentences of people spelling out addresses, names, and other entities.\n", + "\n", + "The model is pretrained on LibriSpeech data and performs poorly on AN4 data (`~17.7%` WER on test data).\n", + "We will use only text from the train part to construct text-only training data for our model and will achieve a good performance on the test part of the AN4 dataset (`~2%` WER)." + ] + }, + { + "cell_type": "markdown", + "id": "923819bb-7822-412a-8f9b-98c76c70e0bb", + "metadata": {}, + "source": [ + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. 
Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run the following cell to set up dependencies.\n", + "\n", + "NOTE: The user is responsible for checking the content of datasets and the applicable licenses and determining if they are suitable for the intended use." + ] + }, + { + "cell_type": "markdown", + "id": "4685a9da-b3f8-4b95-ba74-64a114223233", + "metadata": {}, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d22d241-6c46-492c-99db-3bd69777243c", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import google.colab\n", + "\n", + " IN_COLAB = True\n", + "except (ImportError, ModuleNotFoundError):\n", + " IN_COLAB = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc38a961-8822-4685-89ae-ab6f591f9c28", + "metadata": {}, + "outputs": [], + "source": [ + "BRANCH = 'main'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd60b1c4-7b1d-421d-9d63-95d7458bbcbd", + "metadata": {}, + "outputs": [], + "source": [ + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "if IN_COLAB:\n", + " ## Install dependencies\n", + " !pip install wget\n", + " !apt-get install sox libsndfile1 ffmpeg\n", + " !pip install text-unidecode\n", + "\n", + " ## Install NeMo\n", + " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + ] + }, + { + "cell_type": "markdown", + "id": "08f99618-6f83-44b3-bc8e-f7df04fc471c", + "metadata": {}, + "source": [ + "### Import necessary libraries and utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74f780b1-9b72-4acf-bcf0-64e1ce84e76d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "import string\n", + "import tempfile\n", + "\n", + "from omegaconf import OmegaConf\n", + "import pytorch_lightning 
as pl\n", + "import torch\n", + "from tqdm.auto import tqdm\n", + "import wget\n", + "\n", + "from nemo.collections.asr.models import EncDecCTCModelBPE\n", + "from nemo.collections.asr.models.hybrid_asr_tts_models import ASRWithTTSModel\n", + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest\n", + "from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel\n", + "from nemo.utils.notebook_utils import download_an4\n", + "\n", + "from nemo_text_processing.text_normalization.normalize import Normalizer" + ] + }, + { + "cell_type": "markdown", + "id": "ca928d36-fb0d-439b-bac0-299e98a72d02", + "metadata": {}, + "source": [ + "### Prepare Data" + ] + }, + { + "cell_type": "markdown", + "id": "702e8e92-17b2-4f34-a2d9-c72b94501bf5", + "metadata": {}, + "source": [ + "Download and preprocess AN4 data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c7cfec-aa98-4fc5-8b31-23ee1d59f311", + "metadata": {}, + "outputs": [], + "source": [ + "DATASETS_DIR = Path(\"./datasets\") # directory for data\n", + "CHECKPOINTS_DIR = Path(\"./checkpoints/\") # directory for checkpoints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "659db73e-dcd7-455c-8140-20e104d6ac00", + "metadata": {}, + "outputs": [], + "source": [ + "# create directories if necessary\n", + "DATASETS_DIR.mkdir(parents=True, exist_ok=True)\n", + "CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36830e7f-5293-4401-8c56-780127b47385", + "metadata": {}, + "outputs": [], + "source": [ + "download_an4(data_dir=f\"{DATASETS_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e77f5062-9acb-4f39-b811-a5b11dd6f76f", + "metadata": {}, + "outputs": [], + "source": [ + "AN4_DATASET = DATASETS_DIR / \"an4\"" + ] + }, + { + "cell_type": "markdown", + "id": "403b63b0-8aab-43aa-a455-31f588d1772f", + "metadata": {}, + 
"source": [ + "### Construct text-only training data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35654ee1-3869-4289-bd52-15818c0ccf69", + "metadata": {}, + "outputs": [], + "source": [ + "# read original training data\n", + "an4_train_data = read_manifest(AN4_DATASET / \"train_manifest.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "a17f583c-2a5c-4faf-84bd-eb04c2921e01", + "metadata": {}, + "source": [ + "Text-only manifest should contain three fields:\n", + "- `text`: target text for the ASR model\n", + "- `tts_text`: text to use as a source for the TTS model (unnormalized)\n", + "- `tts_text_normalized`: text to use as a source for TTS model (normalized)\n", + "\n", + "If `tts_text_normalized` is not present, `tts_text` will be used, and normalization will be done when loading the dataset.\n", + "It is highly recommended to normalize the text and manually create the `tts_text_normalized` field since current normalizers are unsuitable for processing a large amount of text on the fly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5938a8c2-e239-4a45-a716-dc11a981aec7", + "metadata": {}, + "outputs": [], + "source": [ + "# fill `text` and `tts_text` fields with the source data\n", + "textonly_data = []\n", + "for record in an4_train_data:\n", + " text = record[\"text\"]\n", + " textonly_data.append({\"text\": text, \"tts_text\": text})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f6a5735-a5c2-4a8b-8116-bfc535a2c299", + "metadata": {}, + "outputs": [], + "source": [ + "WHITELIST_URL = (\n", + " \"https://raw.githubusercontent.com/NVIDIA/NeMo-text-processing/main/\"\n", + " \"nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv\"\n", + ")\n", + "\n", + "\n", + "def get_normalizer() -> Normalizer:\n", + " with tempfile.TemporaryDirectory() as data_dir:\n", + " whitelist_path = Path(data_dir) / \"lj_speech.tsv\"\n", + " if not whitelist_path.exists():\n", + " wget.download(WHITELIST_URL, out=str(data_dir))\n", + "\n", + " normalizer = Normalizer(\n", + " lang=\"en\",\n", + " input_case=\"cased\",\n", + " whitelist=str(whitelist_path),\n", + " overwrite_cache=True,\n", + " cache_dir=None,\n", + " )\n", + " return normalizer" + ] + }, + { + "cell_type": "markdown", + "id": "dd0253aa-d7f1-47ee-a142-099b71241270", + "metadata": {}, + "source": [ + "Construct the `tts_text_normalized` field by applying an English normalizer to the text.\n", + "\n", + "AN4 data doesn't contain numbers, currency, and other entities, so the normalizer is used here only for demonstration purposes."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27bb29d5-d44d-4026-98f8-5f0b1241b39a", + "metadata": {}, + "outputs": [], + "source": [ + "normalizer = get_normalizer()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9400e6d3-ba92-442a-8dd4-117e95dce2ea", + "metadata": {}, + "outputs": [], + "source": [ + "for record in tqdm(textonly_data):\n", + " record[\"tts_text_normalized\"] = normalizer.normalize(\n", + " record[\"tts_text\"], verbose=False, punct_pre_process=True, punct_post_process=True\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "30a934b0-9b58-4bad-bb9a-ab78d81c3859", + "metadata": {}, + "source": [ + "Save manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1833ac15-1750-4468-88bc-2343fbabe4d8", + "metadata": {}, + "outputs": [], + "source": [ + "write_manifest(AN4_DATASET / \"train_text_manifest.json\", textonly_data)" + ] + }, + { + "cell_type": "markdown", + "id": "fa3a2371-8c78-4dd1-9605-a668adf52b4a", + "metadata": {}, + "source": [ + "### Save pretrained checkpoints" + ] + }, + { + "cell_type": "markdown", + "id": "7eb14117-8b8b-4170-ab8c-ce496522a361", + "metadata": {}, + "source": [ + "Firstly we will load pretrained models from NGC and save them as `nemo` checkpoints. \n", + "Our hybrid model will be constructed from these checkpoints.\n", + "We will use:\n", + "- small Conformer-CTC ASR model trained on LibriSpeech data (for finetuning)\n", + "- multi-speaker TTS FastPitch model is trained on LibriTTS data. 
Spectrogram parameters for this model are the same as those used in the ASR model\n", + "- enhancer, which is trained adversarially on the output of the TTS model and natural spectrograms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43c5c75a-b6e0-4b3c-ad26-a07b483d84e6", + "metadata": {}, + "outputs": [], + "source": [ + "ASR_MODEL_PATH = CHECKPOINTS_DIR / \"stt_en_conformer_ctc_small_ls.nemo\"\n", + "TTS_MODEL_PATH = CHECKPOINTS_DIR / \"fastpitch.nemo\"\n", + "ENHANCER_MODEL_PATH = CHECKPOINTS_DIR / \"enhancer.nemo\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40976e22-7a7b-42b2-86a1-9eaaef4c1c22", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# asr model: stt_en_conformer_ctc_small_ls\n", + "asr_model = EncDecCTCModelBPE.from_pretrained(model_name=\"stt_en_conformer_ctc_small_ls\")\n", + "asr_model.save_to(f\"{ASR_MODEL_PATH}\")\n", + "\n", + "# tts model: tts_en_fastpitch_for_asr_finetuning\n", + "tts_model = FastPitchModel.from_pretrained(model_name=\"tts_en_fastpitch_for_asr_finetuning\")\n", + "tts_model.save_to(f\"{TTS_MODEL_PATH}\")\n", + "\n", + "# enhancer model: tts_en_spectrogram_enhancer_for_asr_finetuning\n", + "enhancer_model = SpectrogramEnhancerModel.from_pretrained(model_name=\"tts_en_spectrogram_enhancer_for_asr_finetuning\")\n", + "enhancer_model.save_to(f\"{ENHANCER_MODEL_PATH}\")" + ] + }, + { + "cell_type": "markdown", + "id": "32d1e242-0ab0-43bf-aaa0-997d284c2c1b", + "metadata": {}, + "source": [ + "### Construct hybrid ASR-TTS model " + ] + }, + { + "cell_type": "markdown", + "id": "2210eb07-6d44-44e0-a0ad-866f1e89873a", + "metadata": {}, + "source": [ + "#### Config Parameters\n", + "\n", + "`Hybrid ASR-TTS model` consists of three parts:\n", + "\n", + "* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``)\n", + "* TTS Mel Spectrogram Generator (currently, only `FastPitch` model is supported)\n", + "* Enhancer model 
(optional)\n", + "\n", + "Also, the config allows specifying a text-only dataset.\n", + "\n", + "Main parts of the config:\n", + "\n", + "* ASR model\n", + " * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once, then the config of the ASR model is stored in the ``asr_model`` field\n", + " * ``asr_model_type``: needed only when training from scratch. ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel``\n", + " * ``asr_model_fuse_bn``: fusing BatchNorm in the pretrained ASR model, can improve quality in a finetuning scenario\n", + "* TTS model\n", + " * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field\n", + "* Enhancer model\n", + " * ``enhancer_model_path``: optional path to the enhancer model. Loaded only once, the config is stored in the ``enhancer_model`` field\n", + "* ``train_ds``\n", + " * ``text_data``: properties related to text-only data\n", + " * ``manifest_filepath``: path (or paths) to text-only dataset manifests\n", + " * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training)\n", + " * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words\n", + " * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value.\n", + " * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both are specified).
Correspond to ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the `nemo.collections.common.data.dataset.ConcatDataset`.\n", + " * all other components are similar to conventional ASR models\n", + "* ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d6dd499-d388-4ee3-9a01-d739b16e6ad7", + "metadata": {}, + "outputs": [], + "source": [ + "# load config\n", + "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6701dc8-cb3b-44cc-aab5-fb6e2c1dadb5", + "metadata": {}, + "outputs": [], + "source": [ + "config = OmegaConf.load(\"./configs/hybrid_asr_tts.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c13b3c96-4074-415f-95d2-17569886bfcd", + "metadata": {}, + "outputs": [], + "source": [ + "NUM_EPOCHS = 10" + ] + }, + { + "cell_type": "markdown", + "id": "4d090c5d-44a7-401a-a753-b8779b1c1e0b", + "metadata": {}, + "source": [ + "We will use all available speakers (sampled uniformly)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c41e5e8-d926-4b83-8725-bae5a82121cf", + "metadata": {}, + "outputs": [], + "source": [ + "TTS_SPEAKERS_PATH = Path(\"./checkpoints/speakers.txt\")\n", + "\n", + "with open(TTS_SPEAKERS_PATH, \"w\", encoding=\"utf-8\") as f:\n", + " for speaker_id in range(tts_model.cfg.n_speakers):\n", + " print(speaker_id, file=f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c07c07c-cb15-4a1c-80bf-20eaffaa65d9", + "metadata": {}, + "outputs": [], + "source": [ + "config.model.asr_model_path = ASR_MODEL_PATH\n", + "config.model.tts_model_path = TTS_MODEL_PATH\n", + "config.model.enhancer_model_path = ENHANCER_MODEL_PATH\n", + "\n", + "# fuse BatchNorm automatically in Conformer for better performance\n", + "config.model.asr_model_fuse_bn = True\n", + "\n", + "# training data\n", + "# constructed dataset\n", + "config.model.train_ds.text_data.manifest_filepath = str(AN4_DATASET / \"train_text_manifest.json\")\n", + "# speakers for TTS model\n", + "config.model.train_ds.text_data.speakers_filepath = f\"{TTS_SPEAKERS_PATH}\"\n", + "config.model.train_ds.manifest_filepath = None # audio-text pairs - we don't use them here\n", + "config.model.train_ds.batch_size = 8\n", + "\n", + "# validation data\n", + "config.model.validation_ds.manifest_filepath = str(AN4_DATASET / \"test_manifest.json\")\n", + "config.model.validation_ds.batch_size = 8\n", + "\n", + "config.trainer.max_epochs = NUM_EPOCHS\n", + "\n", + "config.trainer.devices = 1\n", + "config.trainer.strategy = None # use 1 device, no need for ddp strategy\n", + "\n", + "OmegaConf.resolve(config)" + ] + }, + { + "cell_type": "markdown", + "id": "8ae6cb2e-f571-4b53-8897-bb8ba0fc1146", + "metadata": {}, + "source": [ + "#### Construct trainer and ASRWithTTSModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac4ae885-dec4-4ce9-8f69-a1f35d04b08c", + "metadata": {}, + "outputs": [], + "source": [ + "trainer =
pl.Trainer(**config.trainer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f815762-b08d-4d3c-8fd3-61afa511eab4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "hybrid_model = ASRWithTTSModel(config.model)" + ] + }, + { + "cell_type": "markdown", + "id": "ca2c1bf2-28a9-4902-9c73-d96e04b21a46", + "metadata": {}, + "source": [ + "#### Validate the model\n", + "\n", + "Expect `~17.7%` WER on the AN4 test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffa5f5c6-0609-4f46-aa0c-747319035417", + "metadata": {}, + "outputs": [], + "source": [ + "trainer.validate(hybrid_model)" + ] + }, + { + "cell_type": "markdown", + "id": "701ee9c7-91a1-4917-bf7d-ab26b625c7bf", + "metadata": {}, + "source": [ + "#### Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f79761c9-b882-4f14-911f-4a960ff81554", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "trainer.fit(hybrid_model)" + ] + }, + { + "cell_type": "markdown", + "id": "eac18c7c-bdcb-40ad-9c50-37f89fb4aa2a", + "metadata": {}, + "source": [ + "#### Validate the model after training\n", + "\n", + "Expect `~2%` WER on the AN4 test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd927e87-13fb-4b61-8b4a-a6850780f605", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "trainer.validate(hybrid_model)" + ] + }, + { + "cell_type": "markdown", + "id": "6d25a77d-35ed-44b5-9ef5-318afa321acf", + "metadata": {}, + "source": [ + "### Save final model. 
Extract pure ASR model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f53ebd3-b89a-47e4-a0a5-ed3a3572f7c1", + "metadata": {}, + "outputs": [], + "source": [ + "# save full model: the model can be further used for finetuning\n", + "hybrid_model.save_to(\"checkpoints/finetuned_hybrid_model.nemo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0560c2c-af28-4d8f-b36d-c18ec6a482a8", + "metadata": {}, + "outputs": [], + "source": [ + "# extract the resulting ASR model from the hybrid model\n", + "hybrid_model.save_asr_model_to(\"checkpoints/finetuned_asr_model.nemo\")" + ] + }, + { + "cell_type": "markdown", + "id": "2de58fbb-50be-42cd-9095-01cacfdb6931", + "metadata": {}, + "source": [ + "## Using Scripts (examples)" + ] + }, + { + "cell_type": "markdown", + "id": "86655198-b1fc-4615-958c-7c01f3cbd024", + "metadata": {}, + "source": [ + "`/examples/asr/asr_with_tts/` contains scripts for finetuning existing models and training new models from scratch." + ] + }, + { + "cell_type": "markdown", + "id": "b5837536-8280-475c-a581-caaee00edfca", + "metadata": {}, + "source": [ + "### Finetuning Existing Model" + ] + }, + { + "cell_type": "markdown", + "id": "84df9aeb-3b5e-41fc-a8d0-dfc660e71375", + "metadata": {}, + "source": [ + "To finetune an existing ASR model using text-only data, use the `/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py` script with the corresponding config `/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml`.\n", + "\n", + "Please specify paths to all the required models (ASR, TTS, and Enhancer checkpoints), along with `train_ds.text_data.manifest_filepath` and `train_ds.text_data.speakers_filepath`."
+ ] + }, + { + "cell_type": "markdown", + "id": "78b9028c-02ce-4af4-b510-a431f4a2f62b", + "metadata": {}, + "source": [ + "```shell\n", + "python speech_to_text_bpe_with_text_finetune.py \\\n", + " model.asr_model_path= \\\n", + " model.tts_model_path= \\\n", + " model.enhancer_model_path= \\\n", + " model.asr_model_fuse_bn= \\\n", + " model.train_ds.manifest_filepath= \\\n", + " model.train_ds.text_data.manifest_filepath= \\\n", + " model.train_ds.text_data.speakers_filepath= \\\n", + " model.train_ds.text_data.tokenizer_workers=4 \\\n", + " model.validation_ds.manifest_filepath= \\\n", + " model.train_ds.batch_size=\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "0b17c097-a3b1-49a3-8f54-f07b94218d0b", + "metadata": {}, + "source": [ + "### Training a New Model from Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "6d75b928-57b3-4180-bd09-37e018eef7ef", + "metadata": {}, + "source": [ + "```shell\n", + "python speech_to_text_bpe_with_text.py \\\n", + " # (Optional: --config-path= --config-name=) \\\n", + " ++asr_model_type= \\\n", + " ++tts_model_path= \\\n", + " ++enhancer_model_path= \\\n", + " model.tokenizer.dir= \\\n", + " model.tokenizer.type=\"bpe\" \\\n", + " model.train_ds.manifest_filepath= \\\n", + " ++model.train_ds.text_data.manifest_filepath= \\\n", + " ++model.train_ds.text_data.speakers_filepath= \\\n", + " ++model.train_ds.text_data.min_words=1 \\\n", + " ++model.train_ds.text_data.max_words=45 \\\n", + " ++model.train_ds.text_data.tokenizer_workers=4 \\\n", + " model.validation_ds.manifest_filepath= \\\n", + " model.train_ds.batch_size= \\\n", + " trainer.max_epochs= \\\n", + " trainer.num_nodes= \\\n", + " trainer.accumulate_grad_batches=\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "01c17712-ae8d-49cb-ade1-ded168676e27", + "metadata": {}, + "source": [ + "## Training TTS Models for ASR Finetuning" + ] + }, + { + "cell_type": "markdown", + "id": "422dc3b2-d29f-4ed0-b4d2-6d32b35dfb7b", + "metadata": {}, +
"source": [ + "### TTS Model (FastPitch)\n", + "\n", + "TTS model for the purpose of ASR model finetuning should be trained with the same mel spectrogram parameters as used in the ASR model. The typical parameters are `10ms` hop length, `25ms` window length, and the highest band of 8kHz (for 16kHz data). Other parameters are the same as for common multi-speaker TTS models.\n", + "\n", + "Mainly we observed two differences specific to TTS models for ASR:\n", + "- adding more speakers and more data improves the final ASR model quality (but not the perceptual quality of the TTS model)\n", + "- training for more epochs can also improve the quality of the ASR system (but MSE loss used for the TTS model can be higher than optimal on validation data)\n", + "\n", + "Use script `/examples/tts/fastpitch.py` to train a FastPitch model.\n", + "More details about the FastPitch model can be found in the [documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/tts/models.html#fastpitch). \n", + "\n", + "### Enhancer\n", + "Use script `/examples/tts/spectrogram_enhancer.py` to train an Enhancer model. More details can be found in the \n", + "[documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/tts/models.html).\n", + "\n", + "### Models Used in This Tutorial\n", + "\n", + "Some details about the models used in this tutorial can be found on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning).\n", + "\n", + "The system is also described in detail in the paper [Text-only domain adaptation for end-to-end ASR using integrated text-to-mel-spectrogram generator](https://arxiv.org/abs/2302.14036)."
+ ] + }, + { + "cell_type": "markdown", + "id": "9a9a6cd3-4bdc-4b6e-b4b1-3bfd50fd01b3", + "metadata": {}, + "source": [ + "## Summary" + ] + }, + { + "cell_type": "markdown", + "id": "e2890c61-e4b7-47aa-a086-bc483ae7141f", + "metadata": {}, + "source": [ + "The tutorial demonstrated the main concepts related to hybrid ASR-TTS models to finetune ASR models and train new ones from scratch. \n", + "The ability to achieve good text-only adaptation results is demonstrated by finetuning a small Conformer model on text-only data from the AN4 dataset." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ml38", + "language": "python", + "name": "ml38" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f7e33fc1a0dad23109ac81d824006350f6ad2b0b Mon Sep 17 00:00:00 2001 From: Markel Sanz Ausin Date: Thu, 13 Jul 2023 12:15:44 -0700 Subject: [PATCH 107/123] Add end_strings to SamplingParams (#6986) * Add end_strings to SamplingParams Signed-off-by: Gerald Shen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Gerald Shen * Add end_strings to megatron_gpt_inference.yaml Signed-off-by: Gerald Shen * Add end_strings to sampling params Signed-off-by: Gerald Shen * Remove extra_id_1 from default end_strings Signed-off-by: Gerald Shen * Fix require_grad typos (#6930) Signed-off-by: Sergii Dymchenko Signed-off-by: Gerald Shen * fix syntax error Signed-off-by: Gerald Shen * fix the mpt chatbot (#6957) (#6968) Signed-off-by: Yi Dong Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Signed-off-by: Gerald Shen * add support for max_total_length=4096 for 43b (#6763) * add support for max_total_length=4096 for 43b Signed-off-by: Zhilin 
Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Zhilin Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Gerald Shen * rnnt_greedy_decoding.py: typos? auto-repressively -> auto-regressively (#6989) Signed-off-by: Vadim Kantorov Signed-off-by: Gerald Shen * Cache handling without input tensors mutation (#6980) (#6996) * Cache handling without input tensors mutation * Cleanup * Cleanup#2 * Cleanup#3 --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev Co-authored-by: Somshubra Majumdar Signed-off-by: Gerald Shen * Hybrid conformer export (#6983) (#6995) * Implemented generic kv-pair setting of export_config from args * Hybrid conformer export * Hybrid decoder export * Cleanup * Changed from **kwargs * Docstring * Docs added * Stringify args * Added docs for ASR export configs * lowercase ctc --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev Signed-off-by: Gerald Shen * Fixing an issue with confidence ensembles (#6987) (#7004) * Bug fix for the confidence ensembles * Relax constraints for the test --------- Signed-off-by: Igor Gitman Co-authored-by: Igor Gitman Signed-off-by: Gerald Shen * [TTS] Add cosine distance option to TTS aligner (#6806) * [TTS] Add cosine distance option to TTS aligner Signed-off-by: Ryan * [TTS] Update aligner comments Signed-off-by: Ryan --------- Signed-off-by: Ryan Signed-off-by: Gerald Shen * Minor MPT-7B fixes and creation script update (#6982) * Initial commit of minor MPT-7B fixes Signed-off-by: Daniel Egert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Egert Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Gerald Shen * Change Jenkins timeout (#6997) * change timeout Signed-off-by: ericharper * change to 8 
hours Signed-off-by: ericharper --------- Signed-off-by: ericharper Signed-off-by: Gerald Shen * remove hard coded input and output fields (#7008) * remove hard coded input and output fields Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Gerald Shen * RoPE length extrapolation with interpolation (#7005) * Push changes Signed-off-by: MaximumEntropy * Fixes Signed-off-by: MaximumEntropy * add continue training script Signed-off-by: MaximumEntropy * [WIP] nonlinear interp Signed-off-by: MaximumEntropy * Fix Signed-off-by: MaximumEntropy * override encoder_seq_len Signed-off-by: MaximumEntropy * Remove nonlinear Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * sft with pi (#7006) * sft with pi Signed-off-by: Evelina * update values only if not None" Signed-off-by: Evelina --------- Signed-off-by: Evelina * Address comments Signed-off-by: MaximumEntropy * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add info Signed-off-by: MaximumEntropy * Empty Signed-off-by: MaximumEntropy --------- Signed-off-by: MaximumEntropy Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> Signed-off-by: Gerald Shen * use proper config Signed-off-by: Gerald Shen * Add end_strings to SamplingParams Signed-off-by: Gerald Shen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Gerald Shen * Add end_strings to megatron_gpt_inference.yaml Signed-off-by: Gerald Shen * Add end_strings to sampling params Signed-off-by: Gerald Shen * Remove extra_id_1 from 
default end_strings Signed-off-by: Gerald Shen * fix syntax error Signed-off-by: Gerald Shen * use proper config Signed-off-by: Gerald Shen --------- Signed-off-by: Gerald Shen Signed-off-by: Sergii Dymchenko Signed-off-by: Yi Dong Signed-off-by: Zhilin Wang Signed-off-by: Vadim Kantorov Signed-off-by: Boris Fomitchev Signed-off-by: Igor Gitman Signed-off-by: Ryan Signed-off-by: Daniel Egert Signed-off-by: ericharper Signed-off-by: arendu Signed-off-by: MaximumEntropy Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sergii Dymchenko Co-authored-by: Gerald Shen Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> Co-authored-by: Zhilin Wang Co-authored-by: Vadim Kantorov Co-authored-by: Boris Fomitchev Co-authored-by: Somshubra Majumdar Co-authored-by: Igor Gitman Co-authored-by: Ryan Langman Co-authored-by: trias702 <25867060+trias702@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Adi Renduchintala Co-authored-by: Sandeep Subramanian Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> --- .../conf/megatron_gpt_inference.yaml | 2 +- .../nlp/language_modeling/megatron_gpt_eval.py | 1 + .../megatron_gpt_prompt_learning_model.py | 2 ++ .../megatron_gpt_sft_model.py | 1 + .../modules/common/text_generation_server.py | 18 +++++++++--------- .../modules/common/text_generation_utils.py | 9 ++++++--- .../common/transformer/text_generation.py | 2 ++ tests/collections/nlp/test_gpt_eval.py | 1 + 8 files changed, 23 insertions(+), 13 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml index 53d4e9b7e82b..b5b053fc1549 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ 
b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml @@ -9,7 +9,7 @@ inference: repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - + end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated trainer: devices: 1 diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 2a6890e1a9b4..76e68d24bae8 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -267,6 +267,7 @@ def main(cfg) -> None: "add_BOS": cfg.inference.add_BOS, "all_probs": cfg.inference.all_probs, "compute_logprob": cfg.inference.compute_logprob, + "end_strings": cfg.inference.end_strings, } fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 81ca1c283ad0..d14466dd18ee 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -217,6 +217,7 @@ def init_model(self, cfg: DictConfig, trainer: Trainer): "add_BOS": True, "all_probs": False, "compute_logprob": False, + "end_strings": self.cfg.inference.get('end_strings', ["<|endoftext|>"]), } elif self.cfg.get("report_validation_metric", False) and not hasattr(self.cfg, 'inference'): raise ValueError("Must provide inference parameters for reporting validation metric!") @@ -754,6 +755,7 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] "all_probs": 
inference_config["all_probs"], "compute_logprob": inference_config["compute_logprob"], "compute_attention_mask": inference_config.get("compute_attention_mask", True), + "end_strings": inference_config.get('end_strings', ["<|endoftext|>"]), } task_ids, processed_inputs = batch diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index c80d2272613e..c390a8c440bf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -390,6 +390,7 @@ def inference_step(self, dataloader_iter, batch_idx, mode, dataloader_idx=0): "add_BOS": False, "all_probs": False, "compute_logprob": False, + "end_strings": ["<|endoftext|>"], } result = megatron_gpt_generate( model=self, diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py index 5eb69eefcc3e..a9d3b2097af7 100644 --- a/nemo/collections/nlp/modules/common/text_generation_server.py +++ b/nemo/collections/nlp/modules/common/text_generation_server.py @@ -141,6 +141,14 @@ def put(self): if not (1.0 <= repetition_penalty): return "repetition_penalty must be a positive number no less than 1.0" + end_strings = ['<|endoftext|>'] + if 'end_strings' in request.get_json(): + end_strings = request.get_json()['end_strings'] + if not isinstance(end_strings, list): + return "expect end_strings to be a list of strings" + if not all([isinstance(s, str) for s in end_strings]): + return "expect end_strings to be a list of strings" + min_tokens_to_generate = 0 if "min_tokens_to_generate" in request.get_json(): min_tokens_to_generate = request.get_json()["min_tokens_to_generate"] @@ -157,14 +165,6 @@ def put(self): if neighbors < 0: return "num of neighbors must be an integer no less than 0" - end_strings = ['<|endoftext|>'] - if 'end_strings' in 
request.get_json(): - end_strings = request.get_json()['end_strings'] - if not isinstance(end_strings, list): - return "expect end_strings to be a list of strings" - if not all([isinstance(s, str) for s in end_strings]): - return "expect end_strings to be a list of strings" - with lock: # Need to get lock to keep multiple threads from hitting code MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate extra = {} @@ -190,8 +190,8 @@ def put(self): top_p, greedy, repetition_penalty, - min_tokens_to_generate, end_strings=end_strings, + min_tokens_to_generate=min_tokens_to_generate, **extra, ) for k in output: diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index d84d16efb5ba..545ea5cb346c 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -69,6 +69,7 @@ def get_default_sampling_params(): "add_BOS": True, "all_probs": False, "compute_logprob": False, + "end_strings": ["<|endoftext|>", "<extra_id_1>"], } return sampling_params @@ -104,6 +105,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para top_p=sampling_params['top_p'], greedy=sampling_params['use_greedy'], repetition_penalty=sampling_params['repetition_penalty'], + end_strings=sampling_params['end_strings'], min_tokens_to_generate=length_params['min_length'], compute_attention_mask=sampling_params.get("compute_attention_mask", True), **strategy_args, @@ -125,6 +127,7 @@ def megatron_gpt_generate(model, inputs, tokenizer, length_params, sampling_para top_p=sampling_params['top_p'], greedy=sampling_params['use_greedy'], repetition_penalty=sampling_params['repetition_penalty'], + end_strings=sampling_params['end_strings'], min_tokens_to_generate=length_params['min_length'], **strategy_args, ) @@ -380,8 +383,8 @@ def synced_generate( compute_attention_mask=True, compute_logprob=False,
repetition_penalty=1.2, - min_tokens_to_generate=0, end_strings=[], + min_tokens_to_generate=0, ): context_length = context_length_tensor.min().item() tokenizer = model.tokenizer @@ -475,8 +478,8 @@ def generate( compute_attention_mask=True, compute_logprob=False, repetition_penalty=1.0, - min_tokens_to_generate=0, end_strings=['<|endoftext|>'], + min_tokens_to_generate=0, **strategy_args, ) -> OutputType: """ @@ -560,8 +563,8 @@ def generate( top_p=top_p, greedy=greedy, repetition_penalty=repetition_penalty, - min_tokens_to_generate=min_tokens_to_generate, end_strings=end_strings, + min_tokens_to_generate=min_tokens_to_generate, ) special_tokens = set() if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None: diff --git a/nemo/collections/nlp/modules/common/transformer/text_generation.py b/nemo/collections/nlp/modules/common/transformer/text_generation.py index a261e925691f..28db41b8a27a 100644 --- a/nemo/collections/nlp/modules/common/transformer/text_generation.py +++ b/nemo/collections/nlp/modules/common/transformer/text_generation.py @@ -37,6 +37,7 @@ class SamplingParam(TypedDict): add_BOS: bool # add the bos token at the begining of the prompt all_probs: bool # whether return the log prob for all the tokens in vocab compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: List[str] # generation will stop when one of these tokens is generated class OutputType(TypedDict): @@ -88,6 +89,7 @@ def generate( add_BOS: bool, Whether add the bos token at the begining of the prompt all_probs: bool # whether return the log prob for all the tokens in vocab compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: List[str] # generation will stop when one of these tokens is generated Default None, If it is None, use_greedy will be "True". 
Returns: OutputType: It generates the output in a dictionary type. It has the following keys: diff --git a/tests/collections/nlp/test_gpt_eval.py b/tests/collections/nlp/test_gpt_eval.py index 0e64b989176f..fb3f9fda5ac3 100644 --- a/tests/collections/nlp/test_gpt_eval.py +++ b/tests/collections/nlp/test_gpt_eval.py @@ -78,6 +78,7 @@ def test_gpt_eval(self): "add_BOS": True, "all_probs": False, "compute_logprob": False, + "end_strings": ["<|endoftext|>"], } # test logprob From 573397549a3755649ab2476b50684ea7f415205e Mon Sep 17 00:00:00 2001 From: Kim Ngo <6362111+findkim@users.noreply.github.com> Date: Thu, 13 Jul 2023 15:25:03 -0500 Subject: [PATCH 108/123] Fix race condition when executing with multi-node where some ranks does not wait for setup (#7016) Signed-off-by: Kim Ngo <6362111+findkim@users.noreply.github.com> --- .../modules/common/megatron/megatron_utils.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py index d901a00a343b..68437921f930 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_utils.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_utils.py @@ -14,13 +14,14 @@ # limitations under the License. 
import os +import shutil from typing import Dict, List import torch import wget from torch.hub import _get_torch_home -from nemo.utils import logging +from nemo.utils import get_rank, logging __all__ = [ "get_megatron_lm_model", @@ -202,16 +203,14 @@ def _download(path: str, url: str): if url is None: return None - if not os.path.exists(path): - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if not os.path.exists(path): - if master_device: - os.makedirs(MEGATRON_CACHE, exist_ok=True) - logging.info(f"Downloading from {url}") - wget.download(url, path) - # wait until the master process downloads the file and writes it to the cache dir - if torch.distributed.is_initialized(): - torch.distributed.barrier() + if get_rank.is_global_rank_zero() and not os.path.exists(path): + os.makedirs(MEGATRON_CACHE, exist_ok=True) + logging.info(f"Downloading from {url} to {path}") + downloaded_path = wget.download(url) + shutil.move(downloaded_path, path) + # wait until the master process downloads the file and writes it to the cache dir + if torch.distributed.is_initialized(): + torch.distributed.barrier() return path From 470f178f7f6a7f166761d14aa647168cc0dd157c Mon Sep 17 00:00:00 2001 From: tbartley94 <90423858+tbartley94@users.noreply.github.com> Date: Fri, 14 Jul 2023 00:57:09 -0400 Subject: [PATCH 109/123] Added bool types to neural_types export (#7032) Signed-off-by: tbartley94 --- nemo/core/neural_types/elements.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/core/neural_types/elements.py b/nemo/core/neural_types/elements.py index 10638a9c461a..f2de48da26d0 100644 --- a/nemo/core/neural_types/elements.py +++ b/nemo/core/neural_types/elements.py @@ -21,6 +21,7 @@ __all__ = [ 'ElementType', 'VoidType', + 'BoolType', 'ChannelType', 'AcousticEncodedRepresentation', 'AudioSignal', From e859e43ef85cc6bcdde697f634bb3b16ee16bc6b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Jul 2023 13:24:14 +0400 Subject: 
[PATCH 110/123] rnnt and char utils (#6971) * rnnt_ngram_merge Signed-off-by: Nikolay Karpov * char level bug Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Somshubra Majumdar --- .../ngram_lm/kenlm_utils.py | 18 ++-- .../ngram_lm/ngram_merge.py | 83 +++++++++---------- 2 files changed, 49 insertions(+), 52 deletions(-) diff --git a/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py b/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py index 9e255ddc50ca..d9b48afab292 100644 --- a/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py +++ b/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py @@ -79,11 +79,8 @@ def setup_tokenizer(nemo_model_file): ) model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu')) - if type(model.tokenizer).__name__ == 'AggregateTokenizer': - is_aggregate_tokenizer = True - else: - is_aggregate_tokenizer = False - + is_aggregate_tokenizer = False + tokenizer_nemo = None encoding_level = SUPPORTED_MODELS.get(type(model).__name__, None) if not encoding_level: logging.warning( @@ -91,7 +88,12 @@ def setup_tokenizer(nemo_model_file): ) encoding_level = 'char' - tokenizer_nemo = model.tokenizer + if encoding_level == 'subword': + if type(model.tokenizer).__name__ == 'AggregateTokenizer': + is_aggregate_tokenizer = True + + tokenizer_nemo = model.tokenizer + del model return tokenizer_nemo, encoding_level, is_aggregate_tokenizer @@ -117,10 +119,10 @@ def iter_files(source_path, dest_path, tokenizer, encoding_level, is_aggregate_t if isinstance(dest_path, str): with open(dest_path, 'w', encoding='utf-8') as f: for line in dataset: - f.write(line + "\n") + f.write(line[0] + "\n") else: # write to stdin of KenLM for line in dataset: - dest_path.write((line + '\n').encode()) 
+ dest_path.write((line[0] + '\n').encode()) def read_train_file( diff --git a/scripts/asr_language_modeling/ngram_lm/ngram_merge.py b/scripts/asr_language_modeling/ngram_lm/ngram_merge.py index abffc6372518..b6606286ae5b 100644 --- a/scripts/asr_language_modeling/ngram_lm/ngram_merge.py +++ b/scripts/asr_language_modeling/ngram_lm/ngram_merge.py @@ -51,6 +51,7 @@ import torch import nemo.collections.asr as nemo_asr +from nemo.collections.asr.modules.rnnt import RNNTDecoder from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET from nemo.utils import logging @@ -207,9 +208,7 @@ def make_arpa(self, ngram_mod: str, ngram_arpa: str, force: bool): ] return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,) - def test_perplexity( - self, mod_c: str, symbols: str, test_txt: str, nemo_model_file: str, tmp_path: str, force: bool - ) -> str: + def test_perplexity(self, mod_c: str, symbols: str, test_txt: str, nemo_model_file: str, tmp_path: str) -> str: """ Tests the perplexity of a given ngram model on a test file. @@ -229,12 +228,12 @@ def test_perplexity( 'Perplexity: 123.45' """ - test_far = farcompile(symbols, test_txt, tmp_path, nemo_model_file, force) + test_far = farcompile(symbols, test_txt, tmp_path, nemo_model_file) res_p = self.perplexity(mod_c, test_far) return res_p -def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str, force: bool,) -> str: +def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str) -> str: """ Compiles a text file into a FAR file using the given symbol table or tokenizer. @@ -253,43 +252,35 @@ def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str """ test_far = os.path.join(tmp_path, os.path.split(text_file)[1] + ".far") - if os.path.isfile(test_far) and not force: - logging.info("File " + test_far + " exists. 
Skipping.") - return None - else: - sh_args = [ - "farcompilestrings", - "--generate_keys=10", - "--fst_type=compact", - "--symbols=" + symbols, - "--keep_symbols", - ">", - test_far, - ] - - tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(nemo_model_file) - - ps = subprocess.Popen( - " ".join(sh_args), shell=True, stdin=subprocess.PIPE, stdout=sys.stdout, stderr=sys.stderr, - ) - - kenlm_utils.iter_files( - source_path=[text_file], - dest_path=ps.stdin, - tokenizer=tokenizer, - encoding_level=encoding_level, - is_aggregate_tokenizer=is_aggregate_tokenizer, - verbose=1, - ) - stdout, stderr = ps.communicate() + sh_args = [ + "farcompilestrings", + "--generate_keys=10", + "--fst_type=compact", + "--symbols=" + symbols, + "--keep_symbols", + ">", + test_far, + ] + + tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(nemo_model_file) + + ps = subprocess.Popen(" ".join(sh_args), shell=True, stdin=subprocess.PIPE, stdout=sys.stdout, stderr=sys.stderr,) + + kenlm_utils.iter_files( + source_path=[text_file], + dest_path=ps.stdin, + tokenizer=tokenizer, + encoding_level=encoding_level, + is_aggregate_tokenizer=is_aggregate_tokenizer, + verbose=1, + ) + stdout, stderr = ps.communicate() - exit_code = ps.returncode + exit_code = ps.returncode - command = " ".join(sh_args) - assert ( - exit_code == 0 - ), f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}" - return test_far + command = " ".join(sh_args) + assert exit_code == 0, f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}" + return test_far def make_kenlm(kenlm_bin_path: str, ngram_arpa: str, force: bool): @@ -310,7 +301,7 @@ def make_kenlm(kenlm_bin_path: str, ngram_arpa: str, force: bool): logging.info("File " + ngram_kenlm + " exists. 
Skipping.") return None else: - sh_args = [kenlm_bin_path, "trie", "-i", ngram_arpa, ngram_kenlm] + sh_args = [os.path.join(kenlm_bin_path, "build_binary"), "trie", "-i", ngram_arpa, ngram_kenlm] return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,) @@ -336,12 +327,15 @@ def make_symbol_list(nemo_model_file, symbols, force): else: if nemo_model_file.endswith('.nemo'): asr_model = nemo_asr.models.ASRModel.restore_from(nemo_model_file, map_location=torch.device('cpu')) - vocab_size = len(asr_model.decoder.vocabulary) else: logging.warning( "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name." ) asr_model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu')) + + if isinstance(asr_model.decoder, RNNTDecoder): + vocab_size = asr_model.decoder.blank_idx + else: vocab_size = len(asr_model.decoder.vocabulary) vocab = [chr(idx + DEFAULT_TOKEN_OFFSET) for idx in range(vocab_size)] @@ -389,8 +383,9 @@ def main( if not symbols: symbols = os.path.join(out_path, os.path.split(nemo_model_file)[1] + ".syms") make_symbol_list(nemo_model_file, symbols, force) - test_p = nm.test_perplexity(mod_c, symbols, test_file, nemo_model_file, out_path, force) - logging.info("Perplexity summary:" + test_p) + for test_f in test_file.split(","): + test_p = nm.test_perplexity(mod_c, symbols, test_f, nemo_model_file, out_path) + logging.info("Perplexity summary " + test_f + " : " + test_p) logging.info("Making ARPA and Kenlm model " + arpa_c) out = nm.make_arpa(mod_c, arpa_c, force) From 18f283efcb855cb8797fc2c3c9fafd4567136f8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Jul 2023 08:40:05 -0400 Subject: [PATCH 111/123] fix tab text gen (#7022) (#7031) Signed-off-by: Yi Dong Co-authored-by: Yi Dong <43824965+yidong72@users.noreply.github.com> --- 
.../collections/nlp/modules/common/text_generation_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 545ea5cb346c..36b30aae47b9 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -396,6 +396,7 @@ def synced_generate( context_length_tensor, tokens_to_generate, all_probs, + compute_attention_mask=compute_attention_mask, temperature=temperature, ) else: @@ -825,6 +826,7 @@ def tab_sample_sequence_batch( context_lengths, tokens_to_generate, all_probs=True, + compute_attention_mask=True, type_ids=None, temperature=None, ): @@ -848,7 +850,7 @@ def tab_sample_sequence_batch( # initialize the batch with torch.no_grad(): context_length = context_lengths.min().item() - inference_strategy.init_batch(context_tokens, context_length) + inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) context = context_tokens[:, :context_length] # the context may start in the middle of the row, # calculate the offset according to the position of '\n' or '<|endoftext|>' @@ -882,7 +884,7 @@ def tab_sample_sequence_batch( while context_length < maxlen: batch, tensor_shape = inference_strategy.prepare_batch_at_step( - tokens, maxlen, micro_batch_size, counter, context_length + tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask ) output = inference_strategy.forward_step(batch, tensor_shape) From 33100e0ad3c504eb0b402bc7e3ba2aa0479913c5 Mon Sep 17 00:00:00 2001 From: Aleksandr Laptev Date: Sun, 16 Jul 2023 02:03:45 +0700 Subject: [PATCH 112/123] ASR Confidence update and tutorial (#6810) * small fixes and tests Signed-off-by: Aleksandr Laptev * various fixes for the tutorial Signed-off-by: Aleksandr Laptev * tutorial added Signed-off-by: Aleksandr Laptev * for for a little oops after 
rebasement Signed-off-by: Aleksandr Laptev * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix tests Signed-off-by: Aleksandr Laptev * unused import removed Signed-off-by: Aleksandr Laptev * fix review comments Signed-off-by: Aleksandr Laptev * deprecated parameters for greedy configs Signed-off-by: Aleksandr Laptev * move re-assigning to configs Signed-off-by: Aleksandr Laptev * fix comments 2 Signed-off-by: Aleksandr Laptev * fix config tests Signed-off-by: Aleksandr Laptev * fix ece test (my env was bugged apparently) Signed-off-by: Aleksandr Laptev * renamings for confidence ensemble Signed-off-by: Aleksandr Laptev * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fox comments 3 Signed-off-by: Aleksandr Laptev * return dropped tutorial Signed-off-by: Aleksandr Laptev * CI flips back and forth, increasing tolerance Signed-off-by: Aleksandr Laptev --------- Signed-off-by: Aleksandr Laptev Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/starthere/tutorials.rst | 6 + nemo/collections/asr/metrics/rnnt_wer.py | 70 +- nemo/collections/asr/metrics/rnnt_wer_bpe.py | 21 +- nemo/collections/asr/metrics/wer.py | 146 +- nemo/collections/asr/metrics/wer_bpe.py | 20 +- .../asr/models/confidence_ensemble.py | 12 +- .../parts/submodules/ctc_greedy_decoding.py | 55 +- .../parts/submodules/rnnt_greedy_decoding.py | 252 +-- .../asr_confidence_benchmarking_utils.py | 183 +++ .../asr/parts/utils/asr_confidence_utils.py | 251 ++- .../asr/parts/utils/confidence_metrics.py | 202 ++- .../confidence_ensembles/build_ensemble.py | 17 +- .../confidence_ensembles/ensemble_config.yaml | 6 +- .../confidence/benchmark_asr_confidence.py | 173 +- .../asr/confidence/test_asr_confidence.py | 144 ++ .../confidence/test_asr_confidence_metrics.py | 115 ++ .../test_asr_confidence_primitives.py | 142 ++ 
.../test_asr_hybrid_rnnt_ctc_model_char.py | 6 +- tests/collections/asr/test_asr_metrics.py | 10 + .../asr/test_asr_rnnt_encdec_model.py | 6 +- .../asr/test_confidence_ensembles.py | 16 +- tutorials/asr/ASR_Confidence_Estimation.ipynb | 1432 +++++++++++++++++ tutorials/asr/Confidence_Ensembles.ipynb | 2 +- 23 files changed, 2836 insertions(+), 451 deletions(-) create mode 100644 nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py create mode 100644 tests/collections/asr/confidence/test_asr_confidence.py create mode 100644 tests/collections/asr/confidence/test_asr_confidence_metrics.py create mode 100644 tests/collections/asr/confidence/test_asr_confidence_primitives.py create mode 100644 tutorials/asr/ASR_Confidence_Estimation.ipynb diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index 2f4ea59cff5c..586ce46c0c38 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -109,6 +109,12 @@ To run a tutorial: * - ASR - Hybrid ASR-TTS Models Tutorial - `Multi-lingual ASR `_ + * - ASR + - ASR Confidence Estimation + - `ASR Confidence Estimation `_ + * - ASR + - Confidence-based Ensembles + - `Confidence-based Ensembles `_ * - NLP - Using Pretrained Language Models for Downstream Tasks - `Pretrained Language Models for Downstream Tasks `_ diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 7e5636191a1d..87a48e50d58a 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -100,32 +100,33 @@ class AbstractRNNTDecoding(ConfidenceMixin): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. 
- method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. 
+ When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -139,7 +140,7 @@ class AbstractRNNTDecoding(ConfidenceMixin): timestep during greedy decoding. Setting to larger values allows longer sentences to be decoded, at the cost of increased execution time. preserve_frame_confidence: Same as above, overrides above value. - confidence_method: Same as above, overrides confidence_cfg.method. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. @@ -255,15 +256,13 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): # initialize confidence-related fields self._init_confidence(self.cfg.get('confidence_cfg', None)) - # Update preserve frame confidence - if self.preserve_frame_confidence is False: - if self.cfg.strategy in ['greedy', 'greedy_batch']: - self.preserve_frame_confidence = self.cfg.greedy.get('preserve_frame_confidence', False) - self.confidence_method_cfg = self.cfg.greedy.get('confidence_method_cfg', None) - - elif self.cfg.strategy in ['beam', 'tsd', 'alsd', 'maes']: - # Not implemented - pass + # Confidence estimation is not implemented for these strategies + if ( + not self.preserve_frame_confidence + and self.cfg.strategy in ['beam', 'tsd', 'alsd', 'maes'] + and self.cfg.beam.get('preserve_frame_confidence', False) + ): + raise NotImplementedError(f"Confidence calculation is not supported for strategy `{self.cfg.strategy}`") if self.cfg.strategy == 'greedy': if self.big_blank_durations is None: @@ -278,7 +277,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + 
confidence_measure_cfg=self.confidence_measure_cfg, ) else: self.decoding = greedy_decode.GreedyTDTInfer( @@ -292,7 +291,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) else: self.decoding = greedy_decode.GreedyMultiblankRNNTInfer( @@ -305,7 +304,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) elif self.cfg.strategy == 'greedy_batch': @@ -321,7 +320,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) else: self.decoding = greedy_decode.GreedyBatchedTDTInfer( @@ -335,7 +334,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) else: @@ -349,7 +348,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): ), preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) elif self.cfg.strategy == 'beam': @@ -1006,32 +1005,33 @@ class RNNTDecoding(AbstractRNNTDecoding): from the `token_confidence`. 
aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. 
- When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -1047,7 +1047,7 @@ class RNNTDecoding(AbstractRNNTDecoding): preserve_frame_confidence: Same as above, overrides above value. - confidence_method: Same as above, overrides confidence_cfg.method. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index d2e2c3cc5923..3fb50d2a1ee2 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -100,32 +100,33 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. 
If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -141,7 +142,7 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): preserve_frame_confidence: Same as above, overrides above value. - confidence_method: Same as above, overrides confidence_cfg.method. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. 
diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 4d90810cc3df..a88895763edc 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -35,14 +35,17 @@ def word_error_rate(hypotheses: List[str], references: List[str], use_cer=False) -> float: """ Computes Average Word Error rate between two texts represented as - corresponding lists of string. Hypotheses and references must have same - length. + corresponding lists of string. + + Hypotheses and references must have same length. + Args: - hypotheses: list of hypotheses - references: list of references - use_cer: bool, set True to enable cer + hypotheses (list): list of hypotheses + references(list) : list of references + use_cer (bool): set True to enable cer + Returns: - (float) average word error rate + wer (float): average word error rate """ scores = 0 words = 0 @@ -78,17 +81,18 @@ def word_error_rate_detail( between two texts represented as corresponding lists of string. Hypotheses and references must have same length. 
+ Args: - hypotheses (list): list of hypotheses - references(list) : list of references - use_cer (bool): set True to enable cer - Returns: - wer (float): average word error rate - words (int): Total number of words/charactors of given reference texts - ins_rate (float): average insertion error rate - del_rate (float): average deletion error rate - sub_rate (float): average substitution error rate + hypotheses (list): list of hypotheses + references(list) : list of references + use_cer (bool): set True to enable cer + Returns: + wer (float): average word error rate + words (int): Total number of words/charactors of given reference texts + ins_rate (float): average insertion error rate + del_rate (float): average deletion error rate + sub_rate (float): average substitution error rate """ scores = 0 words = 0 @@ -141,6 +145,68 @@ def word_error_rate_detail( return wer, words, ins_rate, del_rate, sub_rate +def word_error_rate_per_utt(hypotheses: List[str], references: List[str], use_cer=False) -> Tuple[List[float], float]: + """ + Computes Word Error Rate per utterance and the average WER + between two texts represented as corresponding lists of string. + + Hypotheses and references must have same length. + + Args: + hypotheses (list): list of hypotheses + references(list) : list of references + use_cer (bool): set True to enable cer + + Returns: + wer_per_utt (List[float]): word error rate per utterance + avg_wer (float): average word error rate + """ + scores = 0 + words = 0 + wer_per_utt = [] + + if len(hypotheses) != len(references): + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) + + for h, r in zip(hypotheses, references): + if use_cer: + h_list = list(h) + r_list = list(r) + else: + h_list = h.split() + r_list = r.split() + + # To get rid of the issue that jiwer does not allow empty string + if len(r_list) == 0: + if len(h_list) != 0: + errors = len(h_list) + wer_per_utt.append(float('inf')) + else: + if use_cer: + measures = jiwer.cer(r, h, return_dict=True) + er = measures['cer'] + else: + measures = jiwer.compute_measures(r, h) + er = measures['wer'] + + errors = measures['insertions'] + measures['deletions'] + measures['substitutions'] + wer_per_utt.append(er) + + scores += errors + words += len(r_list) + + if words != 0: + avg_wer = 1.0 * scores / words + else: + avg_wer = float('inf') + + return wer_per_utt, avg_wer + + def move_dimension_to_the_front(tensor, dim_index): all_dims = list(range(tensor.ndim)) return tensor.permute(*([dim_index] + all_dims[:dim_index] + all_dims[dim_index + 1 :])) @@ -192,32 +258,33 @@ class AbstractCTCDecoding(ConfidenceMixin): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. 
If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -233,6 +300,7 @@ class AbstractCTCDecoding(ConfidenceMixin): preserve_alignments: Same as above, overrides above value. compute_timestamps: Same as above, overrides above value. preserve_frame_confidence: Same as above, overrides above value. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. 
@@ -302,6 +370,14 @@ def __init__(self, decoding_cfg, blank_id: int): # initialize confidence-related fields self._init_confidence(self.cfg.get('confidence_cfg', None)) + # Confidence estimation is not implemented for strategies other than `greedy` + if ( + not self.preserve_frame_confidence + and self.cfg.strategy != 'greedy' + and self.cfg.beam.get('preserve_frame_confidence', False) + ): + raise NotImplementedError(f"Confidence calculation is not supported for strategy `{self.cfg.strategy}`") + # we need timestamps to extract non-blank per-frame confidence if self.compute_timestamps is not None: self.compute_timestamps |= self.preserve_frame_confidence @@ -313,7 +389,7 @@ def __init__(self, decoding_cfg, blank_id: int): preserve_alignments=self.preserve_alignments, compute_timestamps=self.compute_timestamps, preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, + confidence_measure_cfg=self.confidence_measure_cfg, ) elif self.cfg.strategy == 'beam': @@ -961,32 +1037,33 @@ class CTCDecoding(AbstractCTCDecoding): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. 
If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -1002,6 +1079,7 @@ class CTCDecoding(AbstractCTCDecoding): preserve_alignments: Same as above, overrides above value. compute_timestamps: Same as above, overrides above value. preserve_frame_confidence: Same as above, overrides above value. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. 
diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 8a92e4745a1b..524294d61c50 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -74,32 +74,33 @@ class CTCBPEDecoding(AbstractCTCDecoding): from the `token_confidence`. aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. + Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. 
Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -115,6 +116,7 @@ class CTCBPEDecoding(AbstractCTCDecoding): preserve_alignments: Same as above, overrides above value. compute_timestamps: Same as above, overrides above value. preserve_frame_confidence: Same as above, overrides above value. + confidence_measure_cfg: Same as above, overrides confidence_cfg.measure_cfg. "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. 
diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index 9b3191c8874d..bf65ff96ef5c 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++ b/nemo/collections/asr/models/confidence_ensemble.py @@ -25,7 +25,7 @@ from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel from nemo.collections.asr.parts.utils.asr_confidence_utils import ( ConfidenceConfig, - ConfidenceMethodConfig, + ConfidenceMeasureConfig, get_confidence_aggregation_bank, get_confidence_measure_bank, ) @@ -61,8 +61,8 @@ def to_confidence_config(self) -> ConfidenceConfig: return ConfidenceConfig( exclude_blank=self.exclude_blank, aggregation=self.aggregation, - method_cfg=ConfidenceMethodConfig( - name=name, entropy_type=entropy_type, temperature=self.alpha, entropy_norm=entropy_norm, + measure_cfg=ConfidenceMeasureConfig( + name=name, entropy_type=entropy_type, alpha=self.alpha, entropy_norm=entropy_norm, ), ) @@ -135,12 +135,12 @@ def compute_confidence(hypothesis: Hypothesis, confidence_cfg: ConfidenceConfig) filtered_logprobs = get_filtered_logprobs(hypothesis, confidence_cfg.exclude_blank) vocab_size = filtered_logprobs.shape[1] aggr_func = get_confidence_aggregation_bank()[confidence_cfg.aggregation] - if confidence_cfg.method_cfg.name == "max_prob": + if confidence_cfg.measure_cfg.name == "max_prob": conf_type = "max_prob" alpha = 1.0 else: - conf_type = f"entropy_{confidence_cfg.method_cfg.entropy_type}_{confidence_cfg.method_cfg.entropy_norm}" - alpha = confidence_cfg.method_cfg.temperature + conf_type = f"entropy_{confidence_cfg.measure_cfg.entropy_type}_{confidence_cfg.measure_cfg.entropy_norm}" + alpha = confidence_cfg.measure_cfg.alpha conf_func = get_confidence_measure_bank()[conf_type] conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py 
b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index a64eded97208..1f29a511fc9c 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -16,12 +16,13 @@ from typing import List, Optional import torch -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMeasureMixin, ConfidenceMethodConfig +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMeasureConfig, ConfidenceMeasureMixin from nemo.core.classes import Typing, typecheck from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType +from nemo.utils import logging def pack_hypotheses(hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor,) -> List[rnnt_utils.Hypothesis]: @@ -70,31 +71,32 @@ class GreedyCTCInfer(Typing, ConfidenceMeasureMixin): preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated during decoding. When set to true, the Hypothesis will contain the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - 'entropy' for using a normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. 
Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. 
@@ -128,7 +130,7 @@ def __init__( preserve_alignments: bool = False, compute_timestamps: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__() @@ -138,8 +140,8 @@ def __init__( self.compute_timestamps = compute_timestamps | preserve_frame_confidence self.preserve_frame_confidence = preserve_frame_confidence - # set confidence calculation method - self._init_confidence_measure(confidence_method_cfg) + # set confidence calculation measure + self._init_confidence_measure(confidence_measure_cfg) @typecheck() def forward( @@ -251,4 +253,27 @@ class GreedyCTCInferConfig: preserve_alignments: bool = False compute_timestamps: bool = False preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = None + confidence_measure_cfg: Optional[ConfidenceMeasureConfig] = ConfidenceMeasureConfig() + confidence_method_cfg: str = "DEPRECATED" + + def __post_init__(self): + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_measure_cfg + if isinstance(self.confidence_measure_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_measure_cfg) + ) + if self.confidence_method_cfg != "DEPRECATED": + logging.warning( + "`confidence_method_cfg` is deprecated and will be removed in the future. " + "Please use `confidence_measure_cfg` instead." 
+ ) + + # TODO (alaptev): delete the following two lines sometime in the future + logging.warning("Re-writing `confidence_measure_cfg` with the value of `confidence_method_cfg`.") + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_method_cfg + if isinstance(self.confidence_method_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_method_cfg) + ) diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index ac10e54bb249..dfa3ac27854b 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -31,11 +31,11 @@ import numpy as np import torch -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.modules import rnnt_abstract from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMeasureMixin, ConfidenceMethodConfig +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMeasureConfig, ConfidenceMeasureMixin from nemo.collections.common.parts.rnn import label_collate from nemo.core.classes import Typing, typecheck from nemo.core.neural_types import AcousticEncodedRepresentation, ElementType, HypothesisType, LengthsType, NeuralType @@ -96,34 +96,32 @@ class _GreedyRNNTInfer(Typing, ConfidenceMeasureMixin): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. 
- confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. + - 'entropy' for using a normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. If the temperature α is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. 
Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -156,7 +154,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__() self.decoder = decoder_model @@ -168,8 +166,8 @@ def __init__( self.preserve_alignments = preserve_alignments self.preserve_frame_confidence = preserve_frame_confidence - # set confidence calculation method - self._init_confidence_measure(confidence_method_cfg) + # set confidence calculation measure + self._init_confidence_measure(confidence_measure_cfg) def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) @@ -265,31 +263,32 @@ class GreedyRNNTInfer(_GreedyRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). 
Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. + - 'entropy' for using a normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. 
+ When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. @@ -306,7 +305,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -315,7 +314,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) @typecheck() @@ -503,31 +502,32 @@ class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. + - 'entropy' for using a normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. 
If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) entropy_norm: A mapping of the entropy value to the interval [0,1]. 
@@ -544,7 +544,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -553,7 +553,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) # Depending on availability of `blank_as_pad` support @@ -1478,29 +1478,34 @@ class GreedyMultiblankRNNTInfer(GreedyRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). 
- Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. Supported values: - 'lin' for using the linear mapping. 
@@ -1516,7 +1521,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -1525,7 +1530,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) self.big_blank_durations = big_blank_durations self._SOS = blank_index - len(big_blank_durations) @@ -1677,29 +1682,34 @@ class GreedyBatchedMultiblankRNNTInfer(GreedyBatchedRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). 
- Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. Supported values: - 'lin' for using the linear mapping. 
@@ -1715,7 +1725,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -1724,7 +1734,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) self.big_blank_durations = big_blank_durations @@ -2193,7 +2203,31 @@ class GreedyRNNTInferConfig: max_symbols_per_step: Optional[int] = 10 preserve_alignments: bool = False preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = None + confidence_measure_cfg: Optional[ConfidenceMeasureConfig] = ConfidenceMeasureConfig() + confidence_method_cfg: str = "DEPRECATED" + + def __post_init__(self): + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_measure_cfg + if isinstance(self.confidence_measure_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_measure_cfg) + ) + if self.confidence_method_cfg != "DEPRECATED": + logging.warning( + "`confidence_method_cfg` is deprecated and will be removed in the future. " + "Please use `confidence_measure_cfg` instead." 
+ ) + + # TODO (alaptev): delete the following two lines sometime in the future + logging.warning("Re-writing `confidence_measure_cfg` with the value of `confidence_method_cfg`.") + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_method_cfg + if isinstance(self.confidence_method_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_method_cfg) + ) + self.confidence_method_cfg = "DEPRECATED" @dataclass @@ -2201,7 +2235,31 @@ class GreedyBatchedRNNTInferConfig: max_symbols_per_step: Optional[int] = 10 preserve_alignments: bool = False preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = None + confidence_measure_cfg: Optional[ConfidenceMeasureConfig] = ConfidenceMeasureConfig() + confidence_method_cfg: str = "DEPRECATED" + + def __post_init__(self): + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_measure_cfg + if isinstance(self.confidence_measure_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_measure_cfg) + ) + if self.confidence_method_cfg != "DEPRECATED": + logging.warning( + "`confidence_method_cfg` is deprecated and will be removed in the future. " + "Please use `confidence_measure_cfg` instead." 
+ ) + + # TODO (alaptev): delete the following two lines sometime in the future + logging.warning("Re-writing `confidence_measure_cfg` with the value of `confidence_method_cfg`.") + # OmegaConf.structured ensures that post_init check is always executed + self.confidence_measure_cfg = OmegaConf.structured( + self.confidence_method_cfg + if isinstance(self.confidence_method_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.confidence_method_cfg) + ) + self.confidence_method_cfg = "DEPRECATED" class GreedyTDTInfer(_GreedyRNNTInfer): @@ -2230,29 +2288,34 @@ class GreedyTDTInfer(_GreedyRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. 
+ Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. Supported values: - 'lin' for using the linear mapping. 
@@ -2268,7 +2331,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -2277,7 +2340,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) self.durations = durations @@ -2481,29 +2544,34 @@ class GreedyBatchedTDTInfer(_GreedyRNNTInfer): The length of the list corresponds to the Acoustic Length (T). Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame + confidence_measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame confidence scores. - name: The method name (str). + + name: The measure name (str). Supported values: - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using normalized entropy of a log-likelihood vector. - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the temperature α is provided, + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). 
- Note that for this entropy, the temperature should comply the following inequality: - 1/log(V) <= α <= -1/log(1-1/V) where V is the model vocabulary size. + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renui' for the Rényi entropy. + - 'renyi' for the Rényi entropy. Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), where α is a parameter. When α == 1, it works like the Gibbs entropy. More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - temperature: Temperature scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the temperature equals one, scaling is not applied to 'max_prob', + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + entropy_norm: A mapping of the entropy value to the interval [0,1]. Supported values: - 'lin' for using the linear mapping. 
@@ -2519,7 +2587,7 @@ def __init__( max_symbols_per_step: Optional[int] = None, preserve_alignments: bool = False, preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, + confidence_measure_cfg: Optional[DictConfig] = None, ): super().__init__( decoder_model=decoder_model, @@ -2528,7 +2596,7 @@ def __init__( max_symbols_per_step=max_symbols_per_step, preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, + confidence_measure_cfg=confidence_measure_cfg, ) self.durations = durations diff --git a/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py b/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py new file mode 100644 index 000000000000..958195a4bb11 --- /dev/null +++ b/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import contextlib +import copy +import os +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import numpy as np +import texterrors +import torch +from omegaconf import open_dict + +from nemo.collections.asr.models import ASRModel, EncDecRNNTModel +from nemo.collections.asr.parts.utils.confidence_metrics import ( + auc_nt, + auc_pr, + auc_roc, + auc_yc, + ece, + nce, + save_confidence_hist, + save_custom_confidence_curve, + save_nt_curve, + save_pr_curve, + save_roc_curve, +) +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis + + +def get_correct_marks(r: Union[List[int], List[str]], h: Union[List[int], List[str]]) -> List[bool]: + """Get correct marks by aligning the reference text with a hypothesis. + + This method considers only insertions and substitutions as incorrect marks. + """ + return [ + a == b + for a, b in zip(*(texterrors.align_texts([str(rr) for rr in r], [str(hh) for hh in h], False)[:-1])) + if b != "" + ] + + +def get_token_targets_with_confidence(hyp: Hypothesis) -> List[Tuple[str, float]]: + return [(y, c) for y, c in zip(hyp.y_sequence, hyp.token_confidence)] + + +def get_word_targets_with_confidence(hyp: Hypothesis) -> List[Tuple[str, float]]: + return [(y, c) for y, c in zip(hyp.words, hyp.word_confidence)] + + +def run_confidence_benchmark( + model: ASRModel, + target_level: str, + filepaths: List[str], + reference_texts: List[str], + batch_size: int = 8, + num_workers: int = 4, + plot_dir: Optional[Union[str, Path]] = None, + autocast: Optional = None, +): + """Run benchmark and plot histograms and curves, if plot_dir is provided. + + Returns: + Dictionary with benchmark results of the following scheme: + `level: (auc_roc, auc_pr, auc_nt, nce, ece, auc_yc, std_yc, max_yc)` with `level` being 'token' or 'word'. 
+ """ + draw_plot = plot_dir is not None + if isinstance(plot_dir, str): + plot_dir = Path(plot_dir) + is_rnnt = isinstance(model, EncDecRNNTModel) + + # setup autocast if necessary + if autocast is None: + + @contextlib.contextmanager + def autocast(): + yield + + # transcribe audio + with autocast(): + with torch.no_grad(): + transcriptions = model.transcribe( + paths2audio_files=filepaths, batch_size=batch_size, return_hypotheses=True, num_workers=num_workers + ) + if is_rnnt: + transcriptions = transcriptions[0] + + levels = [] + if target_level != "word": + levels.append("token") + if target_level != "token": + levels.append("word") + results = {} + for level in levels: + if level == "token": + targets_with_confidence = [get_token_targets_with_confidence(tran) for tran in transcriptions] + correct_marks = [ + get_correct_marks(model.tokenizer.text_to_ids(r), model.tokenizer.text_to_ids(h.text)) + for r, h in zip(reference_texts, transcriptions) + ] + else: # "word" + targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions] + correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(reference_texts, transcriptions)] + + y_true, y_score = np.array( + [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)] + ).T + # output scheme: yc.mean(), yc.max(), yc.std() or yc.mean(), yc.max(), yc.std(), (thresholds, yc) + result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=draw_plot) + # output scheme: ece or ece, (thresholds, ece_curve) + results_ece = ece(y_true, y_score, return_curve=draw_plot) + results[level] = [ + auc_roc(y_true, y_score), + auc_pr(y_true, y_score), + auc_nt(y_true, y_score), + nce(y_true, y_score), + results_ece if isinstance(results_ece, float) else results_ece[0], + ] + list(result_yc[:3]) + + if draw_plot: + os.makedirs(plot_dir, exist_ok=True) + + mask_correct = y_true == 1 + y_score_correct = y_score[mask_correct] + y_score_incorrect 
= y_score[~mask_correct] + # histogram of the correct distribution + save_confidence_hist(y_score_correct, plot_dir, level + "_" + "hist_correct") + # histogram of the incorrect distribution + save_confidence_hist(y_score_incorrect, plot_dir, level + "_" + "hist_incorrect") + # AUC-ROC curve + save_roc_curve(y_true, y_score, plot_dir, level + "_" + "roc") + # AUC-PR curve + save_pr_curve(y_true, y_score, plot_dir, level + "_" + "pr") + # AUC-NT curve + save_nt_curve(y_true, y_score, plot_dir, level + "_" + "nt") + # AUC-YC curve + yc_thresholds, yc_values = result_yc[-1] + save_custom_confidence_curve( + yc_thresholds, + yc_values, + plot_dir, + level + "_" + "yc", + "Threshold", + "True positive rate − False Positive Rate", + ) + # ECE curve + ece_thresholds, ece_values = results_ece[-1] + ece_values /= max(ece_values) + save_custom_confidence_curve( + ece_thresholds, ece_values, plot_dir, level + "_" + "ece", "Threshold", "|Accuracy − Confidence score|" + ) + + return results + + +def apply_confidence_parameters(decoding_cfg, hp): + """Apply parameters from a parameter grid to a decoding config. + + Returns: + Updated decoding config. 
+ """ + new_decoding_cfg = copy.deepcopy(decoding_cfg) + confidence_cfg_fields = ("aggregation", "exclude_blank") + confidence_measure_cfg_fields = ("name", "alpha", "entropy_type", "entropy_norm") + with open_dict(new_decoding_cfg): + for p, v in hp.items(): + if p in confidence_cfg_fields: + new_decoding_cfg.confidence_cfg[p] = v + elif p in confidence_measure_cfg_fields: + new_decoding_cfg.confidence_cfg.measure_cfg[p] = v + return new_decoding_cfg diff --git a/nemo/collections/asr/parts/utils/asr_confidence_utils.py b/nemo/collections/asr/parts/utils/asr_confidence_utils.py index 1387f6940b38..29c49529a509 100644 --- a/nemo/collections/asr/parts/utils/asr_confidence_utils.py +++ b/nemo/collections/asr/parts/utils/asr_confidence_utils.py @@ -18,46 +18,197 @@ from functools import partial from typing import List, Optional +import torch from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis +from nemo.utils import logging + + +class ConfidenceMeasureConstants: + NAMES = ("max_prob", "entropy") + ENTROPY_TYPES = ("gibbs", "tsallis", "renyi") + ENTROPY_NORMS = ("lin", "exp") + + @classmethod + def print(cls): + return ( + cls.__name__ + + ": " + + str({"NAMES": cls.NAMES, "ENTROPY_TYPES": cls.ENTROPY_TYPES, "ENTROPY_NORMS": cls.ENTROPY_NORMS}) + ) + + +class ConfidenceConstants: + AGGREGATIONS = ("mean", "min", "max", "prod") + + @classmethod + def print(cls): + return cls.__name__ + ": " + str({"AGGREGATIONS": cls.AGGREGATIONS}) @dataclass -class ConfidenceMethodConfig: +class ConfidenceMeasureConfig: + """A Config which contains the measure name and settings to compute per-frame confidence scores. + + Args: + name: The measure name (str). + Supported values: + - 'max_prob' for using the maximum token probability as a confidence. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). 
+ Used if confidence_measure_cfg.name is set to `entropy`. + Supported values: + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, + the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. + - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. + Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/Tsallis_entropy + - 'renyi' for the Rényi entropy. + Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', + and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + + entropy_norm: A mapping of the entropy value to the interval [0,1]. + Supported values: + - 'lin' for using the linear mapping. + - 'exp' for using exponential mapping with linear shift. + """ + name: str = "entropy" entropy_type: str = "tsallis" - temperature: float = 0.33 + alpha: float = 0.33 entropy_norm: str = "exp" + temperature: str = "DEPRECATED" def __post_init__(self): - if self.name not in ("max_prob", "entropy"): - raise ValueError(f"`name` has to be one of the following: `max_prob`, `entropy`. Provided: {self.name}") - if self.entropy_type not in ("gibbs", "tsallis", "renui"): + if self.temperature != "DEPRECATED": + logging.warning( + "`temperature` is deprecated and will be removed in the future. Please use `alpha` instead." 
+ ) + + # TODO (alaptev): delete the following two lines sometime in the future + logging.warning("Re-writing `alpha` with the value of `temperature`.") + # self.temperature has type str + self.alpha = float(self.temperature) + self.temperature = "DEPRECATED" + if self.name not in ConfidenceMeasureConstants.NAMES: raise ValueError( - f"`entropy_type` has to be one of the following: `gibbs`, `tsallis`, `renui`. Provided: {self.entropy_type}" + f"`name` must be one of the following: " + f"{'`' + '`, `'.join(ConfidenceMeasureConstants.NAMES) + '`'}. Provided: `{self.name}`" ) - if self.temperature <= 0.0: - raise ValueError(f"`temperature` has to be > 0. Provided: {self.temperature}") - if self.entropy_norm not in ("lin", "exp"): + if self.entropy_type not in ConfidenceMeasureConstants.ENTROPY_TYPES: raise ValueError( - f"`entropy_norm` has to be one of the following: `lin`, `exp`. Provided: {self.entropy_norm}" + f"`entropy_type` must be one of the following: " + f"{'`' + '`, `'.join(ConfidenceMeasureConstants.ENTROPY_TYPES) + '`'}. Provided: `{self.entropy_type}`" + ) + if self.alpha <= 0.0: + raise ValueError(f"`alpha` must be > 0. Provided: {self.alpha}") + if self.entropy_norm not in ConfidenceMeasureConstants.ENTROPY_NORMS: + raise ValueError( + f"`entropy_norm` must be one of the following: " + f"{'`' + '`, `'.join(ConfidenceMeasureConstants.ENTROPY_NORMS) + '`'}. Provided: `{self.entropy_norm}`" ) @dataclass class ConfidenceConfig: + """A config which contains the following key-value pairs related to confidence scores. + + Args: + preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores + generated during decoding. When set to true, the Hypothesis will contain + the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. + preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores + generated during greedy decoding (sample / batched). 
When set to true, the Hypothesis will contain + the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. + + The length of the list corresponds to the number of recognized tokens. + preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores + generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain + the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. + + The length of the list corresponds to the number of recognized words. + exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded + from the `token_confidence`. + aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. + Valid options are `mean`, `min`, `max`, `prod`. + measure_cfg: A dict-like object which contains the measure name and settings to compute per-frame + confidence scores. + + name: The measure name (str). + Supported values: + - 'max_prob' for using the maximum token probability as a confidence. + - 'entropy' for using a normalized entropy of a log-likelihood vector. + + entropy_type: Which type of entropy to use (str). Used if confidence_measure_cfg.name is set to `entropy`. + Supported values: + - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, + the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). + Note that for this entropy, the alpha should comply the following inequality: + (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) + where V is the model vocabulary size. + - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. + Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/Tsallis_entropy + - 'renyi' for the Rényi entropy. 
+ Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), + where α is a parameter. When α == 1, it works like the Gibbs entropy. + More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy + + alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. + When the alpha equals one, scaling is not applied to 'max_prob', + and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) + + entropy_norm: A mapping of the entropy value to the interval [0,1]. + Supported values: + - 'lin' for using the linear mapping. + - 'exp' for using exponential mapping with linear shift. + """ + preserve_frame_confidence: bool = False preserve_token_confidence: bool = False preserve_word_confidence: bool = False exclude_blank: bool = True aggregation: str = "min" - method_cfg: ConfidenceMethodConfig = ConfidenceMethodConfig() + measure_cfg: ConfidenceMeasureConfig = ConfidenceMeasureConfig() + method_cfg: str = "DEPRECATED" def __post_init__(self): - if self.aggregation not in ("mean", "min", "max", "prod"): + # OmegaConf.structured ensures that post_init check is always executed + self.measure_cfg = OmegaConf.structured( + self.measure_cfg + if isinstance(self.measure_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.measure_cfg) + ) + if self.method_cfg != "DEPRECATED": + logging.warning( + "`method_cfg` is deprecated and will be removed in the future. Please use `measure_cfg` instead." 
+ ) + + # TODO (alaptev): delete the following two lines sometime in the future + logging.warning("Re-writing `measure_cfg` with the value of `method_cfg`.") + # OmegaConf.structured ensures that post_init check is always executed + self.measure_cfg = OmegaConf.structured( + self.method_cfg + if isinstance(self.method_cfg, ConfidenceMeasureConfig) + else ConfidenceMeasureConfig(**self.method_cfg) + ) + self.method_cfg = "DEPRECATED" + if self.aggregation not in ConfidenceConstants.AGGREGATIONS: raise ValueError( - f"`aggregation` has to be one of the following: `mean`, `min`, `max`, `prod`. Provided: {self.aggregation}" + f"`aggregation` has to be one of the following: " + f"{'`' + '`, `'.join(ConfidenceMeasureConstants.AGGREGATIONS) + '`'}. Provided: `{self.aggregation}`" ) @@ -70,32 +221,32 @@ def get_confidence_measure_bank(): entropy_gibbs_exp: Gibbs entropy with exponential normalization entropy_tsallis_lin: Tsallis entropy with linear normalization entropy_tsallis_exp: Tsallis entropy with exponential normalization - entropy_renui_lin: Rényi entropy with linear normalization - entropy_renui_exp: Rényi entropy with exponential normalization + entropy_renyi_lin: Rényi entropy with linear normalization + entropy_renyi_exp: Rényi entropy with exponential normalization Returns: dictionary with lambda functions. 
""" # helper functions - # Gibbs entropy is implemented without temperature + # Gibbs entropy is implemented without alpha neg_entropy_gibbs = lambda x: (x.exp() * x).sum(-1) - neg_entropy_temperature = lambda x, t: (x * t).exp().sum(-1) - neg_entropy_temperature_gibbs = lambda x, t: ((x * t).exp() * x).sum(-1) + neg_entropy_alpha = lambda x, t: (x * t).exp().sum(-1) + neg_entropy_alpha_gibbs = lambda x, t: ((x * t).exp() * x).sum(-1) # too big for a lambda def entropy_tsallis_exp(x, v, t): exp_neg_max_ent = math.exp((1 - math.pow(v, 1 - t)) / (1 - t)) - return (((1 - neg_entropy_temperature(x, t)) / (1 - t)).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) + return (((1 - neg_entropy_alpha(x, t)) / (1 - t)).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) def entropy_gibbs_exp(x, v, t): exp_neg_max_ent = math.pow(v, -t * math.pow(v, 1 - t)) - return ((neg_entropy_temperature_gibbs(x, t) * t).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) + return ((neg_entropy_alpha_gibbs(x, t) * t).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) # use Gibbs entropies for Tsallis and Rényi with t == 1.0 entropy_gibbs_lin_baseline = lambda x, v: 1 + neg_entropy_gibbs(x) / math.log(v) entropy_gibbs_exp_baseline = lambda x, v: (neg_entropy_gibbs(x).exp() * v - 1) / (v - 1) # fill the measure bank confidence_measure_bank = {} - # Maximum probability measure is implemented without temperature + # Maximum probability measure is implemented without alpha confidence_measure_bank["max_prob"] = ( lambda x, v, t: (x.max(dim=-1)[0].exp() * v - 1) / (v - 1) if t == 1.0 @@ -104,7 +255,7 @@ def entropy_gibbs_exp(x, v, t): confidence_measure_bank["entropy_gibbs_lin"] = ( lambda x, v, t: entropy_gibbs_lin_baseline(x, v) if t == 1.0 - else 1 + neg_entropy_temperature_gibbs(x, t) / math.log(v) / math.pow(v, 1 - t) + else 1 + neg_entropy_alpha_gibbs(x, t) / math.log(v) / math.pow(v, 1 - t) ) confidence_measure_bank["entropy_gibbs_exp"] = ( lambda x, v, t: entropy_gibbs_exp_baseline(x, v) if t == 
1.0 else entropy_gibbs_exp(x, v, t) @@ -112,20 +263,20 @@ def entropy_gibbs_exp(x, v, t): confidence_measure_bank["entropy_tsallis_lin"] = ( lambda x, v, t: entropy_gibbs_lin_baseline(x, v) if t == 1.0 - else 1 + (1 - neg_entropy_temperature(x, t)) / (math.pow(v, 1 - t) - 1) + else 1 + (1 - neg_entropy_alpha(x, t)) / (math.pow(v, 1 - t) - 1) ) confidence_measure_bank["entropy_tsallis_exp"] = ( lambda x, v, t: entropy_gibbs_exp_baseline(x, v) if t == 1.0 else entropy_tsallis_exp(x, v, t) ) - confidence_measure_bank["entropy_renui_lin"] = ( + confidence_measure_bank["entropy_renyi_lin"] = ( lambda x, v, t: entropy_gibbs_lin_baseline(x, v) if t == 1.0 - else 1 + neg_entropy_temperature(x, t).log2() / (t - 1) / math.log(v, 2) + else 1 + neg_entropy_alpha(x, t).log2() / (t - 1) / math.log(v, 2) ) - confidence_measure_bank["entropy_renui_exp"] = ( + confidence_measure_bank["entropy_renyi_exp"] = ( lambda x, v, t: entropy_gibbs_exp_baseline(x, v) if t == 1.0 - else (neg_entropy_temperature(x, t).pow(1 / (t - 1)) * v - 1) / (v - 1) + else (neg_entropy_alpha(x, t).pow(1 / (t - 1)) * v - 1) / (v - 1) ) return confidence_measure_bank @@ -160,48 +311,55 @@ class ConfidenceMeasureMixin(ABC): It initializes per-frame confidence measure. """ - def _init_confidence_measure(self, confidence_method_cfg: Optional[DictConfig] = None): + def _init_confidence_measure(self, confidence_measure_cfg: Optional[DictConfig] = None): """Initialize per-frame confidence measure from config. 
""" - if confidence_method_cfg is None: - confidence_method_cfg = OmegaConf.structured(ConfidenceMethodConfig()) + # OmegaConf.structured ensures that post_init check is always executed + confidence_measure_cfg = OmegaConf.structured( + ConfidenceMeasureConfig() + if confidence_measure_cfg is None + else ConfidenceMeasureConfig(**confidence_measure_cfg) + ) - # set confidence calculation method + # set confidence calculation measure # we suppose that self.blank_id == len(vocabulary) self.num_tokens = (self.blank_id if hasattr(self, "blank_id") else self._blank_index) + 1 - self.temperature = confidence_method_cfg.temperature + self.alpha = confidence_measure_cfg.alpha # init confidence measure bank self.confidence_measure_bank = get_confidence_measure_bank() - method = None + measure = None # construct measure_name measure_name = "" - if confidence_method_cfg.name == "max_prob": + if confidence_measure_cfg.name == "max_prob": measure_name = "max_prob" - elif confidence_method_cfg.name == "entropy": + elif confidence_measure_cfg.name == "entropy": measure_name = '_'.join( - [confidence_method_cfg.name, confidence_method_cfg.entropy_type, confidence_method_cfg.entropy_norm] + [confidence_measure_cfg.name, confidence_measure_cfg.entropy_type, confidence_measure_cfg.entropy_norm] ) else: - raise ValueError(f"Unsupported `confidence_method_cfg.name`: `{confidence_method_cfg.name}`") + raise ValueError(f"Unsupported `confidence_measure_cfg.name`: `{confidence_measure_cfg.name}`") if measure_name not in self.confidence_measure_bank: raise ValueError(f"Unsupported measure setup: `{measure_name}`") - method = partial(self.confidence_measure_bank[measure_name], v=self.num_tokens, t=self.temperature) - self._get_confidence = lambda x: method(x).tolist() + measure = partial(self.confidence_measure_bank[measure_name], v=self.num_tokens, t=self.alpha) + self._get_confidence = lambda x: measure(torch.nan_to_num(x)).tolist() class ConfidenceMixin(ABC): """Confidence Mixin class. 
- It initializes per-frame confidence measure. + It is responsible for confidence estimation method initialization and high-level confidence score calculation. """ def _init_confidence(self, confidence_cfg: Optional[DictConfig] = None): """Initialize confidence-related fields and confidence aggregation function from config. """ - if confidence_cfg is None: - confidence_cfg = OmegaConf.structured(ConfidenceConfig()) + # OmegaConf.structured ensures that post_init check is always executed + confidence_cfg = OmegaConf.structured( + ConfidenceConfig() if confidence_cfg is None else ConfidenceConfig(**confidence_cfg) + ) + self.confidence_measure_cfg = confidence_cfg.measure_cfg # extract the config self.preserve_word_confidence = confidence_cfg.get('preserve_word_confidence', False) @@ -216,7 +374,6 @@ def _init_confidence(self, confidence_cfg: Optional[DictConfig] = None): ) self.exclude_blank_from_confidence = confidence_cfg.get('exclude_blank', True) self.word_confidence_aggregation = confidence_cfg.get('aggregation', "min") - self.confidence_method_cfg = confidence_cfg.get('method_cfg', None) # define aggregation functions self.confidence_aggregation_bank = get_confidence_aggregation_bank() @@ -226,7 +383,13 @@ def _init_confidence(self, confidence_cfg: Optional[DictConfig] = None): if self.preserve_frame_confidence is False: if self.cfg.strategy in ['greedy', 'greedy_batch']: self.preserve_frame_confidence = self.cfg.greedy.get('preserve_frame_confidence', False) - self.confidence_method_cfg = self.cfg.greedy.get('confidence_method_cfg', None) + # OmegaConf.structured ensures that post_init check is always executed + confidence_measure_cfg = OmegaConf.structured(self.cfg.greedy).get('confidence_measure_cfg', None) + self.confidence_measure_cfg = ( + OmegaConf.structured(ConfidenceMeasureConfig()) + if confidence_measure_cfg is None + else OmegaConf.structured(ConfidenceMeasureConfig(**confidence_measure_cfg)) + ) @abstractmethod def compute_confidence(self, 
hypotheses_list: List[Hypothesis]) -> List[Hypothesis]: diff --git a/nemo/collections/asr/parts/utils/confidence_metrics.py b/nemo/collections/asr/parts/utils/confidence_metrics.py index 28aa49959041..7d793c9df607 100644 --- a/nemo/collections/asr/parts/utils/confidence_metrics.py +++ b/nemo/collections/asr/parts/utils/confidence_metrics.py @@ -13,47 +13,94 @@ # limitations under the License. import math +import os +from pathlib import Path +from typing import List, Optional, Tuple, Union +import matplotlib.pyplot as plt import numpy as np -from sklearn.metrics import average_precision_score, log_loss, roc_auc_score +from sklearn.metrics import ( + PrecisionRecallDisplay, + RocCurveDisplay, + average_precision_score, + log_loss, + precision_recall_curve, + roc_auc_score, + roc_curve, +) -def auc_roc(y_true, y_score): +def auc_roc(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. + + Note: If only one class is present in y_true, 0.5 is returned. """ + y_true = np.array(y_true) + y_score = np.array(y_score) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) + if np.all(y_true == 0) or np.all(y_true == 1): + return 0.5 return roc_auc_score(y_true, y_score) -def auc_pr(y_true, y_score): +def auc_pr(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: """Compute Area Under the Precision-Recall Curve (PR AUC) from prediction scores. + + Note: If only regatives are present in y_true, 0.0 is returned. 
""" + y_true = np.array(y_true) + y_score = np.array(y_score) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) + if np.all(y_true == 0): + return 0.0 return average_precision_score(y_true, y_score) -def auc_nt(y_true, y_score): +def auc_nt(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: """Compute Area Under the Negative Predictive Value vs. True Negative Rate Curve (NT AUC) from prediction scores. This metric can be thought of as a PR AUC in which errors are treated as positives. + + Note: If only positives are present in y_true, 0.0 is returned. """ y_true = np.array(y_true) y_score = np.array(y_score) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) + if np.all(y_true == 1): + return 0.0 return average_precision_score(1 - y_true, 1 - y_score) -def nce(y_true, y_score): +def nce(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: """Compute Normalized Cross Entropy (NCE) from prediction scores. Also known as the Normalized Mutual Information. NCE measures how close the correct prediction scores are to one and the incorrect prediction scores are to zero. Negative NCE values indicate that the classifier performs worse than the setting all prediction scores as the proportion of correct predictions. + + Note: If only one class is present in y_true, 0.5 is returned. 
""" - p = sum(y_true) / len(y_true) + y_true = np.array(y_true) + y_score = np.array(y_score) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) + if np.all(y_true == 0) or np.all(y_true == 1): + return -math.inf + p = y_true.mean() eps = 1e-15 Hp = -(math.log(p + eps) * p + math.log(1 - p + eps) * (1 - p)) return (Hp - log_loss(y_true, y_score)) / Hp -def ece(y_true, y_score, n_bins=100): +def ece( + y_true: Union[List[int], np.ndarray], + y_score: Union[List[float], np.ndarray], + n_bins: int = 100, + return_curve: bool = False, +) -> Union[float, Tuple[float, Tuple[List[int], List[float]]]]: """Compute Expected Calibration Error (ECE) from prediction scores. ECE measures how close the correct prediction scores are to one and the incorrect prediction scores are to zero. @@ -61,54 +108,159 @@ def ece(y_true, y_score, n_bins=100): """ y_true = np.array(y_true) y_score = np.array(y_score) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) py = np.array([1 - y_score, y_score]).T acc, conf = np.zeros(n_bins), np.zeros(n_bins) Bm = np.zeros(n_bins) + ece_curve = [] + thresholds = [] for m in range(n_bins): a, b = m / n_bins, (m + 1) / n_bins threshold = (a + b) / 2 + thresholds.append(threshold) py_index = (py.T[1] >= threshold).astype(int) py_value = py[np.arange(len(py_index)), py_index] bin_range = ((py_value > a) & (py_value <= b)).nonzero()[0] Bm[m] = len(bin_range) if Bm[m] > 0: - acc[m] = (py_index[bin_range] == y_true[bin_range]).sum() - conf[m] = py_value[bin_range].sum() - if Bm[m] != 0: - acc[m] /= Bm[m] - conf[m] /= Bm[m] - ece = 0 - for m in range(n_bins): - ece += Bm[m] * np.abs((acc[m] - conf[m])) - return ece / sum(Bm) + acc[m] = (py_index[bin_range] == y_true[bin_range]).sum() / Bm[m] + conf[m] = py_value[bin_range].sum() / Bm[m] + ece_curve.append(Bm[m] * np.abs(acc[m] - conf[m])) + ece = sum(ece_curve) / sum(Bm) + if return_curve: + return ece, (thresholds, ece_curve) + 
else: + return ece -def auc_yc(y_true, y_score, return_std_maximum=False, return_curve=False, n_bins=100): +def auc_yc( + y_true: Union[List[int], np.ndarray], + y_score: Union[List[float], np.ndarray], + n_bins: int = 100, + return_std_maximum: bool = False, + return_curve: bool = False, +) -> Union[ + float, + Tuple[float, Tuple[List[int], List[float]]], + Tuple[float, float, float], + Tuple[float, float, float, Tuple[List[int], List[float]]], +]: """Compute Area Under the Youden's Curve (YC AUC) from prediction scores. YC AUC represents the rate of the effective threshold range. If return_std_maximum is set to True, std and maximum values of the Youden's Curve are returned with the AUC. + + Note: If only one class is present in y_true, zeroes are returned for every entity. """ y_true = np.array(y_true) y_score = np.array(y_score) + thresholds = np.linspace(0, 1, n_bins + 1) + assert len(y_true) == len(y_score) + assert np.all(y_true >= 0) and np.all(y_true <= 1) + if np.all(y_true == 0) or np.all(y_true == 1): + if return_std_maximum and return_curve: + return 0.0, 0.0, 0.0, (thresholds, np.zeros(len(thresholds))) + elif return_std_maximum: + return 0.0, 0.0, 0.0 + elif return_curve: + return 0.0, (thresholds, np.zeros(len(thresholds))) + else: + return 0.0 mask_correct = y_true == 1 - count_correct = len(mask_correct.nonzero()[0]) - count_incorrect = len(y_true) - count_correct + count_correct = max(len(mask_correct.nonzero()[0]), 1) + count_incorrect = max(len(y_true) - count_correct, 1) y_score_correct = y_score[mask_correct] y_score_incorrect = y_score[~mask_correct] yc = [] - thresholds = [i / n_bins for i in range(0, n_bins + 1)] for threshold in thresholds: - tnr = len((np.array(y_score_incorrect) < threshold).nonzero()[0]) / count_incorrect - fnr = len((np.array(y_score_correct) < threshold).nonzero()[0]) / count_correct - yc.append(tnr - fnr) + tnr = len((y_score_incorrect < threshold).nonzero()[0]) / count_incorrect + fnr = len((y_score_correct < 
threshold).nonzero()[0]) / count_correct + yc.append(abs(tnr - fnr)) yc = np.array(yc) if return_std_maximum and return_curve: - return yc.mean(), yc.max(), yc.std(), (thresholds, yc) + return yc.mean(), yc.std(), yc.max(), (thresholds, yc) elif return_std_maximum: - return yc.mean(), yc.max(), yc.std() + return yc.mean(), yc.std(), yc.max() elif return_curve: return yc.mean(), (thresholds, yc) else: return yc.mean() + + +def save_confidence_hist(y_score: Union[List[float], np.ndarray], plot_dir: Union[str, Path], name: str = "hist"): + os.makedirs(plot_dir, exist_ok=True) + plt.hist(np.array(y_score), 50, range=(0, 1)) + plt.title(name) + plt.xlabel("Confidence score") + plt.ylabel("Count") + plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) + plt.clf() + + +def save_roc_curve( + y_true: Union[List[int], np.ndarray], + y_score: Union[List[float], np.ndarray], + plot_dir: Union[str, Path], + name: str = "roc", +): + assert len(y_true) == len(y_score) + os.makedirs(plot_dir, exist_ok=True) + fpr, tpr, _ = roc_curve(1 - np.array(y_true), 1 - np.array(y_score)) + RocCurveDisplay(fpr=fpr, tpr=tpr).plot() + plt.title(name) + plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) + plt.clf() + + +def save_pr_curve( + y_true: Union[List[int], np.ndarray], + y_score: Union[List[float], np.ndarray], + plot_dir: Union[str, Path], + name: str = "pr", +): + assert len(y_true) == len(y_score) + os.makedirs(plot_dir, exist_ok=True) + precision, recall, _ = precision_recall_curve(np.array(y_true), np.array(y_score)) + PrecisionRecallDisplay(precision=precision, recall=recall).plot() + plt.title(name) + plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) + plt.clf() + + +def save_nt_curve( + y_true: Union[List[int], np.ndarray], + y_score: Union[List[float], np.ndarray], + plot_dir: Union[str, Path], + name: str = "nt", +): + assert len(y_true) == len(y_score) + os.makedirs(plot_dir, exist_ok=True) + precision, recall, _ = precision_recall_curve(1 - 
np.array(y_true), 1 - np.array(y_score)) + PrecisionRecallDisplay(precision=precision, recall=recall).plot() + plt.title(name) + plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) + plt.clf() + + +def save_custom_confidence_curve( + thresholds: Union[List[float], np.ndarray], + values: Union[List[float], np.ndarray], + plot_dir: Union[str, Path], + name: str = "my_awesome_curve", + xlabel: Optional[str] = None, + ylabel: Optional[str] = None, +): + assert len(thresholds) == len(values) + os.makedirs(plot_dir, exist_ok=True) + plt.plot(thresholds, values) + plt.xlim([0, 1]) + plt.ylim([0, 1]) + plt.title(name) + if xlabel is not None: + plt.xlabel(xlabel) + if ylabel is not None: + plt.ylabel(ylabel) + plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) + plt.clf() diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py index b5685c63aa25..bc32a4f99840 100644 --- a/scripts/confidence_ensembles/build_ensemble.py +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -59,7 +59,7 @@ python build_ensemble.py - tune_confidence_config.confidence_type='[entropy_renui_exp,entropy_tsallis_exp]' # only tune over this set + tune_confidence_config.confidence_type='[entropy_renyi_exp,entropy_tsallis_exp]' # only tune over this set tune_confidence_config.alpha='[0.1,0.5,1.0]' # only tune over this set You can check the dataclasses in this file for the full list of supported @@ -97,7 +97,7 @@ ) from nemo.collections.asr.parts.utils.asr_confidence_utils import ( ConfidenceConfig, - ConfidenceMethodConfig, + ConfidenceMeasureConfig, get_confidence_aggregation_bank, get_confidence_measure_bank, ) @@ -143,8 +143,8 @@ class TuneConfidenceConfig: # not including max prob, as there is always an entropy-based metric # that's better but otherwise including everything confidence_type: Tuple[str] = ( - "entropy_renui_exp", - "entropy_renui_lin", + "entropy_renyi_exp", + "entropy_renyi_lin", "entropy_tsallis_exp", 
"entropy_tsallis_lin", "entropy_gibbs_lin", @@ -214,14 +214,9 @@ class BuildEnsembleConfig: preserve_frame_confidence=True, exclude_blank=True, aggregation="mean", - method_cfg=ConfidenceMethodConfig( - name="entropy", - entropy_type="renui", - temperature=0.25, # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 - entropy_norm="lin", - ), + measure_cfg=ConfidenceMeasureConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",), ) - temperature: float = 1.0 # this is a real temperature that will be applied to logits + temperature: float = 1.0 # this is optional, but can be used to change any aspect of the transcription # config, such as batch size or amp usage. Note that model, data and confidence diff --git a/scripts/confidence_ensembles/ensemble_config.yaml b/scripts/confidence_ensembles/ensemble_config.yaml index 954876a0c3cc..590318ee3b28 100644 --- a/scripts/confidence_ensembles/ensemble_config.yaml +++ b/scripts/confidence_ensembles/ensemble_config.yaml @@ -16,8 +16,8 @@ temperature: 1.0 confidence: exclude_blank: True aggregation: mean - method_cfg: + measure_cfg: name: entropy - entropy_type: renui - temperature: 0.25 # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 + entropy_type: renyi + alpha: 0.25 entropy_norm: lin diff --git a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py index a43e80b2bc3f..8922fe09176d 100644 --- a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py +++ b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py @@ -12,32 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import contextlib -import copy import json import os from dataclasses import dataclass, is_dataclass from pathlib import Path from typing import Optional -import matplotlib.pyplot as plt -import numpy as np import pytorch_lightning as pl -import texterrors import torch -from omegaconf import MISSING, OmegaConf, open_dict -from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, precision_recall_curve, roc_curve +from omegaconf import MISSING, OmegaConf from sklearn.model_selection import ParameterGrid from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig from nemo.collections.asr.metrics.wer import CTCDecodingConfig -from nemo.collections.asr.models import ASRModel +from nemo.collections.asr.models import ASRModel, EncDecRNNTModel +from nemo.collections.asr.parts.utils.asr_confidence_benchmarking_utils import ( + apply_confidence_parameters, + run_confidence_benchmark, +) from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig -from nemo.collections.asr.parts.utils.confidence_metrics import auc_nt, auc_pr, auc_roc, auc_yc, ece, nce from nemo.core.config import hydra_runner from nemo.utils import logging - """ Get confidence metrics and curve plots for a given model, dataset, and confidence parameters. @@ -74,125 +70,10 @@ amp=True \ target_level="word" \ confidence_cfg.exclude_blank=False \ - 'grid_params="{\"aggregation\": [\"min\", \"prod\"], \"temperature\": [0.33, 0.5]}"' + 'grid_params="{\"aggregation\": [\"min\", \"prod\"], \"alpha\": [0.33, 0.5]}"' """ -def get_correct_marks(r, h): - """Get correct marks by aligning the reference text with a hypothesis. - - This method considers only insertions and substitutions as incorrect marks. 
- """ - return [ - a == b - for a, b in zip(*(texterrors.align_texts([str(rr) for rr in r], [str(hh) for hh in h], False)[:-1])) - if b != "" - ] - - -def get_token_targets_with_confidence(hyp): - return [[y, c] for y, c in zip(hyp.y_sequence, hyp.token_confidence)] - - -def get_word_targets_with_confidence(hyp): - return [[y, c] for y, c in zip(hyp.words, hyp.word_confidence)] - - -def run_benchmark( - model, batch_size, num_workers, is_rnnt, target_level, filepaths, reference_texts, plot_dir, autocast -): - """Run benchmark and plot histograms and curves. - - Returns: - Dictionary with benchmark results of the following scheme: - `level: (auc_roc, auc_pr, auc_nt, nce, ece, auc_yc, max_yc, std_yc)` with `level` being 'token' or 'word'. - """ - # transcribe audio - with autocast(): - with torch.no_grad(): - transcriptions = model.transcribe( - paths2audio_files=filepaths, batch_size=batch_size, return_hypotheses=True, num_workers=num_workers - ) - if is_rnnt: - transcriptions = transcriptions[0] - - levels = [] - if target_level != "word": - levels.append("token") - if target_level != "token": - levels.append("word") - results = {} - for level in levels: - if level == "token": - targets_with_confidence = [get_token_targets_with_confidence(tran) for tran in transcriptions] - correct_marks = [ - get_correct_marks(model.tokenizer.text_to_ids(r), model.tokenizer.text_to_ids(h.text)) - for r, h in zip(reference_texts, transcriptions) - ] - else: # "word" - targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions] - correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(reference_texts, transcriptions)] - - y_true, y_score = np.array( - [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)] - ).T - mask_correct = y_true == 1 - y_score_correct = y_score[mask_correct] - y_score_incorrect = y_score[~mask_correct] - result_yc = auc_yc(y_true, y_score, return_std_maximum=True, 
return_curve=True) - results[level] = [ - auc_roc(y_true, y_score), - auc_pr(y_true, y_score), - auc_nt(y_true, y_score), - nce(y_true, y_score), - ece(y_true, y_score), - ] + list(result_yc[:-1]) - - os.makedirs(plot_dir, exist_ok=True) - plt.hist(np.array(y_score_correct), 50, range=(0, 1)) - plt.savefig(plot_dir / Path(level + "_" + "hist_correct.png"), dpi=300) - plt.clf() - plt.hist(np.array(y_score_incorrect), 50, range=(0, 1)) - plt.savefig(plot_dir / Path(level + "_" + "hist_incorrect.png"), dpi=300) - plt.clf() - fpr, tpr, _ = roc_curve(1 - y_true, 1 - y_score) - RocCurveDisplay(fpr=fpr, tpr=tpr).plot() - plt.savefig(plot_dir / Path(level + "_" + "roc.png"), dpi=300) - plt.clf() - precision, recall, _ = precision_recall_curve(y_true, y_score) - PrecisionRecallDisplay(precision=precision, recall=recall).plot() - plt.savefig(plot_dir / Path(level + "_" + "pr.png"), dpi=300) - plt.clf() - precision, recall, _ = precision_recall_curve(1 - y_true, 1 - y_score) - PrecisionRecallDisplay(precision=precision, recall=recall).plot() - plt.savefig(plot_dir / Path(level + "_" + "nt.png"), dpi=300) - plt.clf() - plt.plot(*result_yc[-1]) - plt.ylim([0, 1]) - plt.savefig(plot_dir / Path(level + "_" + "yc.png"), dpi=300) - plt.clf() - - return results - - -def apply_parameters(decoding_cfg, hp): - """Apply parameters from a parameter grid to a decoding config. - - Returns: - Updated decoding config. - """ - new_decoding_cfg = copy.deepcopy(decoding_cfg) - confidence_cfg_fields = ("aggregation", "exclude_blank") - confidence_method_cfg_fields = ("name", "temperature", "entropy_type", "entropy_norm") - with open_dict(new_decoding_cfg): - for p, v in hp.items(): - if p in confidence_cfg_fields: - new_decoding_cfg.confidence_cfg[p] = v - elif p in confidence_method_cfg_fields: - new_decoding_cfg.confidence_cfg.method_cfg[p] = v - return new_decoding_cfg - - def get_experiment_params(cfg): """Get experiment parameters from a confidence config and generate the experiment name. 
@@ -202,23 +83,23 @@ def get_experiment_params(cfg): """ blank = "no_blank" if cfg.exclude_blank else "blank" aggregation = cfg.aggregation - method_name = cfg.method_cfg.name - temperature = cfg.method_cfg.temperature + method_name = cfg.measure_cfg.name + alpha = cfg.measure_cfg.alpha if method_name == "entropy": - entropy_type = cfg.method_cfg.entropy_type - entropy_norm = cfg.method_cfg.entropy_norm + entropy_type = cfg.measure_cfg.entropy_type + entropy_norm = cfg.measure_cfg.entropy_norm experiment_param_list = [ aggregation, str(cfg.exclude_blank), method_name, entropy_type, entropy_norm, - str(temperature), + str(alpha), ] - experiment_str = "-".join([aggregation, blank, method_name, entropy_type, entropy_norm, str(temperature)]) + experiment_str = "-".join([aggregation, blank, method_name, entropy_type, entropy_norm, str(alpha)]) else: - experiment_param_list = [aggregation, str(cfg.exclude_blank), method_name, "-", "-", str(temperature)] - experiment_str = "-".join([aggregation, blank, method_name, str(temperature)]) + experiment_param_list = [aggregation, str(cfg.exclude_blank), method_name, "-", "-", str(alpha)] + experiment_str = "-".join([aggregation, blank, method_name, str(alpha)]) return experiment_param_list, experiment_str @@ -294,7 +175,7 @@ def main(cfg: ConfidenceBenchmarkingConfig): asr_model = asr_model.eval() # Check if ctc or rnnt model - is_rnnt = hasattr(asr_model, 'joint') + is_rnnt = isinstance(asr_model, EncDecRNNTModel) # Check that the model has the `change_decoding_strategy` method if not hasattr(asr_model, 'change_decoding_strategy'): @@ -317,14 +198,10 @@ def main(cfg: ConfidenceBenchmarkingConfig): reference_texts.append(item['text']) # setup AMP (optional) + autocast = None if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): logging.info("AMP enabled!\n") autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield # do 
grid-based benchmarking if grid_params is provided, otherwise a regular one work_dir = Path(cfg.output_dir) @@ -338,7 +215,7 @@ def autocast(): "method_name", "entropy_type", "entropy_norm", - "temperature", + "alpha", "target_level", "auc_roc", "auc_pr", @@ -346,8 +223,8 @@ def autocast(): "nce", "ece", "auc_yc", - "max_yc", "std_yc", + "max_yc", ] ) + "\n" @@ -374,17 +251,16 @@ def autocast(): f.flush() for i, hp in enumerate(hp_grid): logging.info(f"Run # {i + 1}, grid: `{hp}`") - asr_model.change_decoding_strategy(apply_parameters(asr_model.cfg.decoding, hp)) + asr_model.change_decoding_strategy(apply_confidence_parameters(asr_model.cfg.decoding, hp)) param_list, experiment_name = get_experiment_params(asr_model.cfg.decoding.confidence_cfg) plot_dir = work_dir / Path(experiment_name) - results = run_benchmark( + results = run_confidence_benchmark( asr_model, - cfg.batch_size, - cfg.num_workers, - is_rnnt, cfg.target_level, filepaths, reference_texts, + cfg.batch_size, + cfg.num_workers, plot_dir, autocast, ) @@ -406,11 +282,10 @@ def autocast(): with open(report_file, "tw", encoding="utf-8") as f: f.write(report_legend) f.flush() - results = run_benchmark( + results = run_confidence_benchmark( asr_model, cfg.batch_size, cfg.num_workers, - is_rnnt, cfg.target_level, filepaths, reference_texts, diff --git a/tests/collections/asr/confidence/test_asr_confidence.py b/tests/collections/asr/confidence/test_asr_confidence.py new file mode 100644 index 000000000000..11b127424908 --- /dev/null +++ b/tests/collections/asr/confidence/test_asr_confidence.py @@ -0,0 +1,144 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import math +import tempfile +from pathlib import Path + +import numpy as np +import pytest +from omegaconf import OmegaConf +from pytorch_lightning import Trainer + +from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig +from nemo.collections.asr.metrics.wer import CTCDecodingConfig +from nemo.collections.asr.models import ASRModel, EncDecCTCModelBPE, EncDecRNNTBPEModel +from nemo.collections.asr.parts.submodules.ctc_greedy_decoding import GreedyCTCInferConfig +from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import GreedyRNNTInferConfig +from nemo.collections.asr.parts.utils.asr_confidence_benchmarking_utils import run_confidence_benchmark +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig + +# both models recognize the test data without errors, thus every metric except ece return default values +ECE_VALUES = {("token", "ctc"): 0.87, ("token", "rnnt"): 0.82, ("word", "ctc"): 0.91, ("word", "rnnt"): 0.88} + +TOL_DEGREE = 2 +TOL = 1 / math.pow(10, TOL_DEGREE) + + +@pytest.fixture(scope="module") +def conformer_ctc_bpe_model(): + model = EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small") + model.set_trainer(Trainer(devices=1, accelerator="cpu")) + model = model.eval() + return model + + +@pytest.fixture(scope="module") +def conformer_rnnt_bpe_model(): + model = EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_small") + model.set_trainer(Trainer(devices=1, accelerator="cpu")) + model = model.eval() + return model + + 
+@pytest.mark.with_downloads +@pytest.fixture(scope="module") +# @pytest.fixture +def audio_and_texts(test_data_dir): + # get filenames and reference texts from manifest + filepaths = [] + reference_texts = [] + manifest = Path(test_data_dir) / Path("asr/an4_val.json") + with open(manifest, 'r') as f: + for line in f: + item = json.loads(line) + # alaptev: maybe fix those paths in the manifest? + audio_file = Path(item['audio_filepath'].replace("/data/", "/.data/")) + filepaths.append(str(audio_file.absolute())) + reference_texts.append(item['text']) + return filepaths, reference_texts + + +class TestASRConfidenceBenchmark: + @pytest.mark.integration + @pytest.mark.with_downloads + @pytest.mark.parametrize('model_name', ("ctc", "rnnt")) + @pytest.mark.parametrize('target_level', ("token", "word")) + def test_run_confidence_benchmark( + self, model_name, target_level, audio_and_texts, conformer_ctc_bpe_model, conformer_rnnt_bpe_model + ): + model = conformer_ctc_bpe_model if model_name == "ctc" else conformer_rnnt_bpe_model + assert isinstance(model, ASRModel) + filepaths, reference_texts = audio_and_texts + confidence_cfg = ( + ConfidenceConfig(preserve_token_confidence=True) + if target_level == "token" + else ConfidenceConfig(preserve_word_confidence=True) + ) + model.change_decoding_strategy( + RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy_batch", confidence_cfg=confidence_cfg) + if model_name == "rnnt" + else CTCDecodingConfig(confidence_cfg=confidence_cfg) + ) + with tempfile.TemporaryDirectory() as tmpdir: + assert np.allclose( + np.array( + run_confidence_benchmark(model, target_level, filepaths, reference_texts, plot_dir=tmpdir)[ + target_level + ] + ), + np.array([0.5, 1.0, 0.0, -math.inf, ECE_VALUES[(target_level, model_name)], 0.0, 0.0, 0.0]), + atol=TOL, + ) + + @pytest.mark.integration + @pytest.mark.with_downloads + @pytest.mark.parametrize('model_name', ("ctc", "rnnt")) + @pytest.mark.parametrize('arg', ("method_cfg", "temperature", 
"all")) + def test_deprecated_config_args(self, model_name, arg, conformer_ctc_bpe_model, conformer_rnnt_bpe_model): + assert ConfidenceConfig().measure_cfg.alpha == 0.33, "default `alpha` is supposed to be 0.33" + model = conformer_ctc_bpe_model if model_name == "ctc" else conformer_rnnt_bpe_model + assert isinstance(model, ASRModel) + if arg == "all": + conf = OmegaConf.create({"temperature": 0.5}) + test_args_main = {"method_cfg": conf} + test_args_greedy = {"confidence_method_cfg": conf} + elif arg == "method_cfg": + conf = OmegaConf.create({"alpha": 0.5}) + test_args_main = {"method_cfg": conf} + test_args_greedy = {"confidence_method_cfg": conf} + elif arg == "temperature": + conf = OmegaConf.create({"temperature": 0.5}) + test_args_main = {"measure_cfg": conf} + test_args_greedy = {"confidence_measure_cfg": conf} + else: + raise NotImplementedError(arg) + confidence_cfg = ConfidenceConfig(preserve_word_confidence=True, **test_args_main) + model.change_decoding_strategy( + RNNTDecodingConfig(fused_batch_size=-1, strategy="greedy", confidence_cfg=confidence_cfg) + if model_name == "rnnt" + else CTCDecodingConfig(confidence_cfg=confidence_cfg) + ) + assert model.cfg.decoding.confidence_cfg.measure_cfg.alpha == 0.5 + model.change_decoding_strategy( + RNNTDecodingConfig( + fused_batch_size=-1, + strategy="greedy", + greedy=GreedyRNNTInferConfig(preserve_frame_confidence=True, **test_args_greedy), + ) + if model_name == "rnnt" + else CTCDecodingConfig(greedy=GreedyCTCInferConfig(preserve_frame_confidence=True, **test_args_greedy)) + ) + assert model.cfg.decoding.greedy.confidence_measure_cfg.alpha == 0.5 diff --git a/tests/collections/asr/confidence/test_asr_confidence_metrics.py b/tests/collections/asr/confidence/test_asr_confidence_metrics.py new file mode 100644 index 000000000000..fde5f322a988 --- /dev/null +++ b/tests/collections/asr/confidence/test_asr_confidence_metrics.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import tempfile + +import numpy as np +import pytest +from scipy.stats import uniform + +from nemo.collections.asr.parts.utils.confidence_metrics import ( + auc_nt, + auc_pr, + auc_roc, + auc_yc, + ece, + nce, + save_confidence_hist, + save_custom_confidence_curve, + save_nt_curve, + save_pr_curve, + save_roc_curve, +) + +# set convenient name2metric mapping +name2metric = { + f.__name__: (f, ans) + for f, ans in zip((auc_roc, auc_pr, auc_nt, auc_yc, ece, nce), (0.833, 0.917, 0.833, 0.421, 0.232, 0.403)) +} +# ece does not have a default value +name2metric_all_correct = { + f.__name__: (f, ans) for f, ans in zip((auc_roc, auc_pr, auc_nt, auc_yc, nce), (0.5, 1.0, 0.0, 0.0, -math.inf)) +} +name2metric_all_incorrect = { + f.__name__: (f, ans) for f, ans in zip((auc_roc, auc_pr, auc_nt, auc_yc, nce), (0.5, 0.0, 1.0, 0.0, -math.inf)) +} + +# Initialize data +Y_TRUE = [1, 0, 0, 1, 1] +Y_TRUE_ALL_CORRECT = [1, 1, 1, 1, 1] +Y_TRUE_ALL_INCORRECT = [0, 0, 0, 0, 0] +Y_SCORE = [0.6, 0.7, 0.02, 0.95, 0.8] +Y_TRUE_RANDOM = np.random.choice(2, 1000, p=[0.2, 0.8]) +# probability distribution with mean ~= 0.65 and std ~= 0.25 +Y_SCORE_RANDOM = uniform.rvs(size=1000, loc=0.5, scale=0.5) - 0.5 * np.random.choice(2, 1000, p=[0.8, 0.2]) + +TOL_DEGREE = 3 +TOL = 1 / math.pow(10, TOL_DEGREE) + + +class TestConfidenceMetrics: + @pytest.mark.unit + @pytest.mark.parametrize('metric_name', name2metric.keys()) + def 
test_metric_main(self, metric_name): + metric, ans = name2metric[metric_name] + + assert round(metric(Y_TRUE, Y_SCORE), TOL_DEGREE) == ans + + @pytest.mark.unit + @pytest.mark.parametrize('metric_name', name2metric_all_correct.keys()) + def test_metric_all_correct(self, metric_name): + metric, ans = name2metric_all_correct[metric_name] + + assert round(metric(Y_TRUE_ALL_CORRECT, Y_SCORE), TOL_DEGREE) == ans + + @pytest.mark.unit + @pytest.mark.parametrize('metric_name', name2metric_all_incorrect.keys()) + def test_metric_all_incorrect(self, metric_name): + metric, ans = name2metric_all_incorrect[metric_name] + + assert round(metric(Y_TRUE_ALL_INCORRECT, Y_SCORE), TOL_DEGREE) == ans + + @pytest.mark.unit + def test_metric_auc_yc_aux(self): + n_bins = 10 + result, result_std, result_max, (thresholds, yc_curve) = auc_yc( + Y_TRUE, Y_SCORE, n_bins=n_bins, return_std_maximum=True, return_curve=True + ) + + assert round(result_std, TOL_DEGREE) == 0.228 + assert round(result_max, TOL_DEGREE) == 0.667 + assert np.allclose(np.array(thresholds), np.array([i / n_bins for i in range(0, n_bins + 1)]), atol=TOL) + assert np.allclose( + np.array(yc_curve), np.array([0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.167, 0.667, 0.667, 0.333, 0.0]), atol=TOL + ) + + +class TestSaveConfidencePlot: + @pytest.mark.unit + def test_save_confidence_hist(self): + with tempfile.TemporaryDirectory() as tmpdir: + save_confidence_hist(Y_SCORE_RANDOM, tmpdir) + + @pytest.mark.unit + @pytest.mark.parametrize('plot_func', (save_roc_curve, save_pr_curve, save_nt_curve)) + def test_save_simple_confidence_curve(self, plot_func): + with tempfile.TemporaryDirectory() as tmpdir: + plot_func(Y_TRUE_RANDOM, Y_SCORE_RANDOM, tmpdir) + + @pytest.mark.unit + def test_save_custom_confidence_curve(self): + with tempfile.TemporaryDirectory() as tmpdir: + ranges = np.arange(0, 1, 0.01) + save_custom_confidence_curve(ranges, ranges, tmpdir) diff --git a/tests/collections/asr/confidence/test_asr_confidence_primitives.py 
b/tests/collections/asr/confidence/test_asr_confidence_primitives.py new file mode 100644 index 000000000000..d1111406ca62 --- /dev/null +++ b/tests/collections/asr/confidence/test_asr_confidence_primitives.py @@ -0,0 +1,142 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import pytest +import torch + +from nemo.collections.asr.parts.utils.asr_confidence_utils import ( + get_confidence_aggregation_bank, + get_confidence_measure_bank, +) + +# Initialize probability vectors +VOCAB_SIZES = (100, 1000, 10000) +ONE_VEC_SET, ZERO_VEC_SET, RAND_VEC_SET, OVERFIT_RAND_VEC_SET = {}, {}, {}, {} +for vocab_size in VOCAB_SIZES: + # batch size 2 to test different positions of probability one + ONE_VEC_SET[vocab_size] = torch.nan_to_num( + torch.cat( + [ + torch.tensor([[0] + [float('-inf')] * (vocab_size - 1)]), + torch.tensor([[float('-inf')] * (vocab_size - 3) + [0] + [float('-inf')] * 2]), + ] + ) + ) + ZERO_VEC_SET[vocab_size] = torch.nan_to_num(torch.tensor([[math.log(1 / vocab_size)] * vocab_size] * 2)) + # batch size 1 + rand_logit = torch.rand((1, vocab_size)) + rand_logit_overfit = rand_logit.clone() + rand_logit_overfit[0, 0] += vocab_size + RAND_VEC_SET[vocab_size] = torch.nan_to_num(torch.nn.functional.log_softmax(rand_logit, -1)) + OVERFIT_RAND_VEC_SET[vocab_size] = torch.nan_to_num(torch.nn.functional.log_softmax(rand_logit_overfit, -1)) +AGGREGATION_VEC_SIMPLE = [0.0, 0.5, 1] + 
+TOL_DEGREE = 6 +TOL = 1 / math.pow(10, TOL_DEGREE) + + +def get_measure_parametrize_ranges(): + confidence_measure_bank = {} + alpha_range = (0.25, 0.5, 1.0) + bank_exception = None + try: + confidence_measure_bank = get_confidence_measure_bank() + except Exception as e: + alpha_range = () + bank_exception = e + return confidence_measure_bank, alpha_range, bank_exception + + +def get_aggregation_parametrize_ranges(): + confidence_aggregation_bank = {} + bank_exception = None + try: + confidence_aggregation_bank = get_confidence_aggregation_bank() + except Exception as e: + bank_exception = e + return confidence_aggregation_bank, bank_exception + + +class TestConfidenceMeasureBank: + measure_bank, alphas, bank_build_exception = get_measure_parametrize_ranges() + + @pytest.mark.unit + def test_measure_bank(self): + if self.bank_build_exception is not None: + raise self.bank_build_exception + + assert isinstance(self.measure_bank, dict) + assert len(self.measure_bank) > 0 + + @pytest.mark.unit + @pytest.mark.parametrize('measure_name', measure_bank.keys()) + @pytest.mark.parametrize('alpha', alphas) + @pytest.mark.parametrize('vocab_size', VOCAB_SIZES) + def test_confidence_measures_one(self, measure_name, alpha, vocab_size): + measure = self.measure_bank[measure_name] + + assert torch.allclose(measure(ONE_VEC_SET[vocab_size], vocab_size, alpha), torch.tensor([1.0, 1.0]), atol=TOL) + + @pytest.mark.unit + @pytest.mark.parametrize('measure_name', measure_bank.keys()) + @pytest.mark.parametrize('alpha', alphas) + @pytest.mark.parametrize('vocab_size', VOCAB_SIZES) + def test_confidence_measures_zero(self, measure_name, alpha, vocab_size): + measure = self.measure_bank[measure_name] + + assert torch.allclose(measure(ZERO_VEC_SET[vocab_size], vocab_size, alpha), torch.tensor([0.0, 0.0]), atol=TOL) + + @pytest.mark.unit + @pytest.mark.parametrize('measure_name', measure_bank.keys()) + @pytest.mark.parametrize('alpha', alphas) + @pytest.mark.parametrize('vocab_size', 
VOCAB_SIZES) + def test_confidence_measures_partial_order(self, measure_name, alpha, vocab_size): + measure = self.measure_bank[measure_name] + value_normal = round(float(measure(RAND_VEC_SET[vocab_size], vocab_size, alpha)[0]), TOL_DEGREE) + value_overfit = round(float(measure(OVERFIT_RAND_VEC_SET[vocab_size], vocab_size, alpha)[0]), TOL_DEGREE) + + assert 0 <= value_normal < value_overfit <= 1, ( + measure(RAND_VEC_SET[vocab_size], vocab_size, alpha), + measure(OVERFIT_RAND_VEC_SET[vocab_size], vocab_size, alpha), + ) + + +class TestConfidenceAggregationBank: + aggregation_bank, bank_build_exception = get_aggregation_parametrize_ranges() + + @pytest.mark.unit + def test_aggregation_bank(self): + if self.bank_build_exception is not None: + raise self.bank_build_exception + + assert isinstance(self.aggregation_bank, dict) + assert len(self.aggregation_bank) > 0 + + @pytest.mark.unit + @pytest.mark.parametrize('aggregation_name', aggregation_bank.keys()) + def test_confidence_agregation_simple(self, aggregation_name): + # alaptev: would skipif work with parametrize arguments? 
+ if aggregation_name not in ("mean", "min", "max", "prod"): + pytest.skip(f"{aggregation_name} is not a simple aggregation") + aggregation = self.aggregation_bank[aggregation_name] + if aggregation_name == "mean": + assert aggregation(AGGREGATION_VEC_SIMPLE) == 0.5 + elif aggregation_name == "min": + assert aggregation(AGGREGATION_VEC_SIMPLE) == 0.0 + if aggregation_name == "max": + assert aggregation(AGGREGATION_VEC_SIMPLE) == 1.0 + if aggregation_name == "prod": + assert aggregation(AGGREGATION_VEC_SIMPLE) == 0.0 diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index 22926b6516ee..8687ed683833 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -242,7 +242,8 @@ def test_decoding_change(self, hybrid_asr_model): @pytest.mark.unit def test_GreedyRNNTInferConfig(self): - IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index'] + # confidence_method_cfg is deprecated + IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index', 'confidence_method_cfg'] result = assert_dataclass_signature_match( greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyRNNTInferConfig, ignore_args=IGNORE_ARGS @@ -256,7 +257,8 @@ def test_GreedyRNNTInferConfig(self): @pytest.mark.unit def test_GreedyBatchedRNNTInferConfig(self): - IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index'] + # confidence_method_cfg is deprecated + IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index', 'confidence_method_cfg'] result = assert_dataclass_signature_match( greedy_decode.GreedyBatchedRNNTInfer, greedy_decode.GreedyBatchedRNNTInferConfig, ignore_args=IGNORE_ARGS diff --git a/tests/collections/asr/test_asr_metrics.py b/tests/collections/asr/test_asr_metrics.py index 9a43ed4e2b90..2c4ec0953444 100644 --- a/tests/collections/asr/test_asr_metrics.py +++ b/tests/collections/asr/test_asr_metrics.py @@ -32,6 +32,7 
@@ CTCDecodingConfig, word_error_rate, word_error_rate_detail, + word_error_rate_per_utt, ) from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis @@ -136,6 +137,15 @@ def test_wer_function(self): 0.0, ) + assert word_error_rate_per_utt(hypotheses=['kat'], references=['cat']) == ([1.0], 1.0) + assert word_error_rate_per_utt(hypotheses=['cat', ''], references=['', 'gpu']) == ([float("inf"), 1.0], 2.0) + assert word_error_rate_per_utt( + hypotheses=['ducuti motorcycle', 'G P U'], references=['ducati motorcycle', 'GPU'] + ) == ([0.5, 3.0], 4 / 3) + assert word_error_rate_per_utt( + hypotheses=['ducuti motorcycle', 'G P U'], references=['ducati motorcycle', 'GPU'], use_cer=True + ) == ([1 / 17, 2 / 3], 0.15) + @pytest.mark.unit @pytest.mark.parametrize("batch_dim_index", [0, 1]) @pytest.mark.parametrize("test_wer_bpe", [False, True]) diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index 68f1e38f797b..775a146c74c4 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -242,7 +242,8 @@ def test_decoding_change(self, asr_model): @pytest.mark.unit def test_GreedyRNNTInferConfig(self): - IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index'] + # confidence_method_cfg is deprecated + IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index', 'confidence_method_cfg'] result = assert_dataclass_signature_match( greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyRNNTInferConfig, ignore_args=IGNORE_ARGS @@ -256,7 +257,8 @@ def test_GreedyRNNTInferConfig(self): @pytest.mark.unit def test_GreedyBatchedRNNTInferConfig(self): - IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index'] + # confidence_method_cfg is deprecated + IGNORE_ARGS = ['decoder_model', 'joint_model', 'blank_index', 'confidence_method_cfg'] result = 
assert_dataclass_signature_match( greedy_decode.GreedyBatchedRNNTInfer, greedy_decode.GreedyBatchedRNNTInferConfig, ignore_args=IGNORE_ARGS diff --git a/tests/collections/asr/test_confidence_ensembles.py b/tests/collections/asr/test_confidence_ensembles.py index ad14a2a7e6ff..b8b027dd3426 100644 --- a/tests/collections/asr/test_confidence_ensembles.py +++ b/tests/collections/asr/test_confidence_ensembles.py @@ -19,7 +19,7 @@ from nemo.collections.asr.metrics.wer import CTCDecodingConfig from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMethodConfig +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMeasureConfig def get_model_config(model_class): @@ -117,12 +117,7 @@ def test_model_creation_2models(self, tmp_path, model_class0, model_class1): preserve_frame_confidence=True, exclude_blank=True, aggregation="mean", - method_cfg=ConfidenceMethodConfig( - name="entropy", - entropy_type="renui", - temperature=0.25, # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 - entropy_norm="lin", - ), + measure_cfg=ConfidenceMeasureConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",), ) # just checking that no errors are raised when creating the model @@ -153,12 +148,7 @@ def test_model_creation_5models(self, tmp_path): preserve_frame_confidence=True, exclude_blank=True, aggregation="mean", - method_cfg=ConfidenceMethodConfig( - name="entropy", - entropy_type="renui", - temperature=0.25, # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 - entropy_norm="lin", - ), + measure_cfg=ConfidenceMeasureConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",), ) # just checking that no errors are raised when creating 
the model diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb new file mode 100644 index 000000000000..2a1ad024a889 --- /dev/null +++ b/tutorials/asr/ASR_Confidence_Estimation.ipynb @@ -0,0 +1,1432 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "abe9913d", + "metadata": { + "id": "1a0f93c6" + }, + "outputs": [], + "source": [ + "BRANCH = 'main'\n", + "\n", + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd26974d", + "metadata": { + "id": "ffdfe626" + }, + "outputs": [], + "source": [ + "import os\n", + "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", + "\n", + "# option #1: local path to NeMo repo with NeMo already installed\n", + "NEMO_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath('')))\n", + "is_colab = False\n", + "\n", + "# option #2: download NeMo repo\n", + "if 'google.colab' in str(get_ipython()) or not os.path.exists(os.path.join(NEMO_DIR_PATH, \"nemo\")):\n", + " ## Install dependencies\n", + " !apt-get install sox libsndfile1 ffmpeg\n", + "\n", + " !git clone -b $BRANCH https://github.com/NVIDIA/NeMo\n", + " %cd NeMo\n", + " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + " NEMO_DIR_PATH = os.path.abspath('')\n", + " is_colab = True\n", + "\n", + "import sys\n", + "sys.path.insert(0, NEMO_DIR_PATH)" + ] + }, + { + 
"cell_type": "markdown", + "id": "b3f35d50", + "metadata": { + "id": "bcc3e593" + }, + "source": [ + "# 1. Introduction to ASR confidence estimation\n", + "Confidence estimation is a crucial yet sometimes overlooked aspect of automatic speech recognition (ASR) systems. Confidence estimation for ASR is the process of estimating the rate of reliability of the output generated by an ASR system. For an output transcription, confidence estimation answers the question \"how accurate this transcription is\", or \"how likely this transcription is correct\".\n", + "\n", + "Confidence score is the result of confidence estimation. It lies in range from 0 to 1, where zero signals that the confidence estimator is completely unsure, and one indicates that the estimator is confident in the output. Confidence scores are often used to guide downstream processing in ASR applications. For example, in a voice dictation application, a low confidence score could trigger the system to ask the user to repeat the input or to suggest alternative transcriptions.\n", + "\n", + "There are several approaches to confidence estimation in ASR, including:\n", + "\n", + "1. Acoustic modeling-based methods: These methods use the acoustic model scores to estimate the confidence score. The acoustic model represents the relationship between the acoustic signal and the corresponding linguistic units, and the score reflects the similarity between the observed signal and the predicted model output. Here, the acoustic model can be the ASR model itself (non-trainable methods), or a trainable external estimator, accepting acoustic features or output probabilities and predicting confidence scores.\n", + "\n", + "2. Language modeling-based methods: These methods use the language model scores to estimate the confidence score. The language model represents the probability distribution of the sequence of words, and the score reflects the likelihood of the transcription given the language model. \n", + "\n", + "3. 
Combination methods: These methods combine the scores from both the acoustic and language models to estimate the confidence score. This approach can leverage the strengths of both models to achieve more accurate confidence scores.\n", + "\n", + "In this introductory tutorial we will cover only the non-trainable acoustic-based methods." + ] + }, + { + "cell_type": "markdown", + "id": "34e356bf", + "metadata": { + "id": "59100fb9" + }, + "source": [ + "## 1.1. Optional resources\n", + "This tutorial is self-contained, but if you want to dive deeper into the topic, you can check out these resources:\n", + "* Paper behind this tutorial: https://arxiv.org/abs/2212.08703\n", + "* Supplementary blog on how and why confidence estimation methods of this tutorial were developed: https://developer.nvidia.com/blog/entropy-based-methods-for-word-level-asr-confidence-estimation/" + ] + }, + { + "cell_type": "markdown", + "id": "9739cb35", + "metadata": { + "id": "cd7226c5" + }, + "source": [ + "# 2. Data Download\n", + "First, let's download audio and text data. Here we will use LibriSpeech *dev-other* and *test-other*." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46b2861b", + "metadata": { + "id": "fd542e62" + }, + "outputs": [], + "source": [ + "## create data directory and download an audio file\n", + "WORK_DIR = 'WORK_DIR'\n", + "DATA_DIR = WORK_DIR + '/DATA'\n", + "os.makedirs(DATA_DIR, exist_ok=True)\n", + "\n", + "print('downloading audio data...')\n", + "!python $NEMO_DIR_PATH/scripts/dataset_processing/get_librispeech_data.py --data_root=$DATA_DIR --data_set=test_other\n", + "!rm $DATA_DIR/test_other.tar.gz" + ] + }, + { + "cell_type": "markdown", + "id": "8ba5ad12", + "metadata": { + "id": "383eee71" + }, + "source": [ + "# 3. Confidence estimation example\n", + "Let's see how confidence scores can be obtained with NeMo models." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a95697fe", + "metadata": { + "id": "7c7c0170" + }, + "source": [ + "## 3.1. Helper functions\n", + "The following functions are to pretty-print confidence scores for word-level ASR hypotheses." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bd12b7b", + "metadata": { + "id": "20cf0b38" + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from termcolor import colored\n", + "from typing import List, Optional, Tuple, Union\n", + "\n", + "from IPython.display import Audio, HTML, Image, display\n", + "import numpy as np\n", + "import texterrors\n", + "\n", + "def get_detailed_wer_labels(ref: List[str], hyp: List[str], return_eps_padded_hyp: bool = False):\n", + " \"\"\"Get detailed WER labels, aligning reference with hypothesis.\n", + " \n", + " Possible WER labels:\n", + " - 'C' for Correct,\n", + " - 'I' for Insertion,\n", + " - 'D' for Deletion,\n", + " - 'S' for Substitution.\n", + "\n", + " Returns:\n", + " WER labels list.\n", + " [Optional] Epsilin-padded hypothesis if return_eps_padded_hyp set to True.\n", + " \"\"\"\n", + "\n", + " # Align reference and hypothesis using \"\"\n", + " aligned_ref, aligned_hyp = texterrors.align_texts(ref, hyp, False)[:-1]\n", + "\n", + " # Determine labels\n", + " labels = []\n", + " for r, h in zip(aligned_ref, aligned_hyp):\n", + " if r == h:\n", + " labels.append(\"C\")\n", + " elif r == \"\":\n", + " labels.append(\"I\")\n", + " elif h == \"\":\n", + " labels.append(\"D\")\n", + " else:\n", + " labels.append(\"S\")\n", + "\n", + " return labels if not return_eps_padded_hyp else labels, aligned_hyp\n", + "\n", + "\n", + "def fill_confidence_deletions(confidence_scores: List[float], labels: List[str], fill_value: float = 0.0):\n", + " \"\"\"Fill confidence scores list with the provided value for deletions.\n", + " Assumes that we have no natural confidence scores for deletions.\n", + " \n", + " Returns:\n", + " Confidence scores 
list with deletion scores.\n", + " \"\"\"\n", + "\n", + " assert len(confidence_scores) <= len(labels)\n", + "\n", + " # If the lengths of confidence_scores and labels are equal, then we assume that there are no deletions\n", + " if len(confidence_scores) == len(labels):\n", + " return confidence_scores\n", + "\n", + " # Insert fill_value into confidence_scores where label == \"D\"\n", + " new_confidence_scores = []\n", + " score_index = 0\n", + " for label in labels:\n", + " if label == \"D\":\n", + " new_confidence_scores.append(fill_value)\n", + " else:\n", + " new_confidence_scores.append(confidence_scores[score_index])\n", + " score_index += 1\n", + " return new_confidence_scores\n", + "\n", + "\n", + "def pretty_pad_word_labels(labels: List[str], words: List[str]):\n", + " \"\"\"Pad word labels with dash for pretty printing.\n", + " Expects labels and words to have the same length.\n", + " \n", + " Returns:\n", + " Padded labels list.\n", + " \"\"\"\n", + " \n", + " # Check that words and labels without 'D' have the same length\n", + " assert len(words) == len(labels)\n", + "\n", + " # Pad the labels with dashes to align them with the words\n", + " padded_labels = []\n", + " for word, label in zip(words, labels):\n", + " label_len = len(word)\n", + " left_padding = (label_len - 1) // 2\n", + " right_padding = label_len - left_padding - 1\n", + " padded_label = \"-\" * left_padding + label + \"-\" * right_padding\n", + " padded_labels.append(padded_label)\n", + "\n", + " return padded_labels\n", + "\n", + "\n", + "def _html_paint_word_grey(word: str, shade: str):\n", + " if shade == \"black\":\n", + " color = \"0,0,0\"\n", + " elif shade == \"grey\":\n", + " color = \"150,150,150\"\n", + " elif shade == \"light_grey\":\n", + " color = \"200,200,200\"\n", + " else:\n", + " raise ValueError(\n", + " f\"`shade` has to be one of the following: `black`, `grey`, `light_grey`. 
Provided: `{shade}`\"\n", + " )\n", + " return f'{word}'\n", + "\n", + "\n", + "def pretty_print_transcript_with_confidence(\n", + " transcript: str,\n", + " confidence_scores: List[float],\n", + " threshold: float,\n", + " reference: Optional[str] = None,\n", + " terminal_width: int = 120,\n", + " html: bool = False,\n", + "):\n", + " if html:\n", + " shade_if_low_confidence = lambda x, y: _html_paint_word_grey(x, 'light_grey' if y < threshold else 'black')\n", + " new_line_mark = \"
\"\n", + " pretty_print = lambda x: display(HTML(\"\" + new_line_mark.join(x) + \"\"))\n", + " else:\n", + " shade_if_low_confidence = lambda x, y: colored(x, 'light_grey') if y < threshold else x\n", + " new_line_mark = \"\\n\"\n", + " pretty_print = lambda x: print(new_line_mark.join(x))\n", + " with_labels = reference is not None\n", + " transcript_list = transcript.split()\n", + " output_lines = []\n", + " if with_labels:\n", + " reference_list = reference.split()\n", + " labels, eps_padded_hyp = get_detailed_wer_labels(reference_list, transcript_list, True)\n", + " padded_labels = pretty_pad_word_labels(labels, eps_padded_hyp)\n", + " current_line_len = 0\n", + " current_word_line = \"\"\n", + " current_label_line = \"\"\n", + " for word, label, padded_label, score in zip(\n", + " eps_padded_hyp, labels, padded_labels, fill_confidence_deletions(confidence_scores, labels)\n", + " ):\n", + " word_len = len(word)\n", + " # shield angle brakets for \n", + " if html and word == \"\":\n", + " word = \"<eps>\"\n", + " if current_line_len + word_len + 1 <= terminal_width:\n", + " if current_line_len > 0:\n", + " current_line_len += 1\n", + " current_word_line += \" \"\n", + " current_label_line += \"-\"\n", + " current_line_len += word_len\n", + " current_word_line += shade_if_low_confidence(word, score)\n", + " current_label_line += padded_label\n", + " else:\n", + " output_lines.append(current_word_line + new_line_mark + current_label_line)\n", + " current_line_len = word_len\n", + " current_word_line = shade_if_low_confidence(word, score)\n", + " current_label_line = padded_label\n", + " if current_word_line:\n", + " output_lines.append(current_word_line + new_line_mark + current_label_line)\n", + " else:\n", + " current_line_len = 0\n", + " current_word_line = \"\"\n", + " for word, score in zip(transcript_list, confidence_scores):\n", + " word_len = len(word)\n", + " # shield angle brakets for \n", + " if html and word == \"\":\n", + " word = \"<eps>\"\n", + " if 
current_line_len + word_len + 1 <= terminal_width:\n", + " if current_line_len > 0:\n", + " current_line_len += 1\n", + " current_word_line += \" \"\n", + " current_line_len += word_len\n", + " current_word_line += shade_if_low_confidence(word, score)\n", + " else:\n", + " output_lines.append(current_word_line)\n", + " current_line_len = word_len\n", + " current_word_line = shade_if_low_confidence(word, score)\n", + " if current_word_line:\n", + " output_lines.append(current_word_line)\n", + "\n", + " pretty_print(output_lines)" + ] + }, + { + "cell_type": "markdown", + "id": "ed997bfd", + "metadata": { + "id": "dec57a27" + }, + "source": [ + "## 3.2. Data and model loading\n", + "This tutorial uses CTC and RNN-T Conformer models trained on LibriSpeech.\n", + "\n", + "You can try to use other pre-trained models as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70c1a27a", + "metadata": { + "id": "b66c60a3" + }, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from omegaconf import DictConfig, OmegaConf\n", + "\n", + "from nemo.collections.asr.models import ASRModel\n", + "\n", + "def load_model(name: str):\n", + " \"\"\"Load a pre-trained model.\n", + "\n", + " Args:\n", + " name: Pre-trained model name.\n", + " Reserved names:\n", + " - 'ctc' for 'stt_en_conformer_ctc_large_ls'\n", + " - 'rnnt' for 'stt_en_conformer_transducer_large_ls'\n", + "\n", + " Returns:\n", + " A model loaded into GPU with .eval() mode set.\n", + " \"\"\"\n", + " if name == \"ctc\":\n", + " name = \"stt_en_conformer_ctc_large_ls\"\n", + " elif name == \"rnnt\":\n", + " name = \"stt_en_conformer_transducer_large_ls\"\n", + "\n", + " model = ASRModel.from_pretrained(model_name=name, map_location=\"cuda:0\")\n", + " model.eval()\n", + "\n", + " return model\n", + "\n", + "@dataclass\n", + "class TestSet:\n", + " filepaths: List[str]\n", + " reference_texts: List[str]\n", + " durations: List[float]\n", + "\n", + "def 
load_data(manifest_path: str):\n", + " filepaths = []\n", + " reference_texts = []\n", + " durations = []\n", + " with open(manifest_path, \"r\") as f:\n", + " for line in f:\n", + " item = json.loads(line)\n", + " audio_file = item[\"audio_filepath\"]\n", + " filepaths.append(str(audio_file))\n", + " text = item[\"text\"]\n", + " reference_texts.append(text)\n", + " durations.append(float(item[\"duration\"]))\n", + " return TestSet(filepaths, reference_texts, durations)\n", + "\n", + "TEST_MANIFESTS = {\n", + " \"test_other\": DATA_DIR + \"/test_other.json\",\n", + "}\n", + "\n", + "\n", + "# Load data\n", + "test_sets = {manifest: load_data(path) for manifest, path in TEST_MANIFESTS.items()}\n", + "\n", + "# Load model\n", + "is_rnnt = False\n", + "# is_rnnt = True\n", + "\n", + "model = load_model(\"rnnt\" if is_rnnt else \"ctc\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c5db700", + "metadata": { + "id": "88c3d7ee" + }, + "source": [ + "## 3.3. Setting up confidence estimation\n", + "To set up confidence estimation for NeMo ASR models, you need to:\n", + "1. Initialize _ConfidenceConfig_\n", + "2. Put the created _ConfidenceConfig_ into the model decoding config.\n", + "\n", + "The following cell contains an example of _ConfidenceConfig_ initialization and updating the model's decoding config.\n", + "\n", + "The possible values for the _ConfidenceConfig_ parameters are also listed there.\n", + "\n", + "Note that only `strategy=\"greedy\"` (or `greedy_batch` for RNN-T) supports computing confidence scores."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3e8c11", + "metadata": { + "id": "078005f1" + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig\n", + "from nemo.collections.asr.metrics.wer import CTCDecodingConfig\n", + "from nemo.collections.asr.parts.utils.asr_confidence_utils import (\n", + " ConfidenceConfig,\n", + " ConfidenceConstants,\n", + " ConfidenceMeasureConfig,\n", + " ConfidenceMeasureConstants,\n", + ")\n", + "from nemo.collections.asr.parts.utils.asr_confidence_benchmarking_utils import (\n", + " apply_confidence_parameters,\n", + " get_correct_marks,\n", + " get_token_targets_with_confidence,\n", + " get_word_targets_with_confidence,\n", + ")\n", + "\n", + "\n", + "# List allowed options for ConfidenceMeasureConfig and ConfidenceConfig\n", + "print(f\"Allowed options for ConfidenceMeasureConfig: {ConfidenceMeasureConstants.print()}\\n\")\n", + "print(f\"Allowed options for ConfidenceConfig: {ConfidenceConstants.print()}\\n\")\n", + "\n", + "# Initialize ConfidenceConfig and ConfidenceMeasureConfig\n", + "confidence_cfg = ConfidenceConfig(\n", + " preserve_frame_confidence=True, # Internally set to true if preserve_token_confidence == True\n", + " # or preserve_word_confidence == True\n", + " preserve_token_confidence=True, # Internally set to true if preserve_word_confidence == True\n", + " preserve_word_confidence=True,\n", + " aggregation=\"prod\", # How to aggregate frame scores to token scores and token scores to word scores\n", + " exclude_blank=False, # If true, only non-blank emissions contribute to confidence scores\n", + " measure_cfg=ConfidenceMeasureConfig( # Config for per-frame scores calculation (before aggregation)\n", + " name=\"max_prob\", # Or \"entropy\" (default), which usually works better\n", + " entropy_type=\"gibbs\", # Used only for name == \"entropy\". 
Recommended: \"tsallis\" (default) or \"renyi\"\n", + " alpha=0.5, # Low values (<1) increase sensitivity, high values decrease sensitivity\n", + " entropy_norm=\"lin\" # How to normalize (map to [0,1]) entropy. Default: \"exp\"\n", + " )\n", + ")\n", + "\n", + "# Alternalively, look at ConfidenceConfig's docstring\n", + "print(f\"More info on ConfidenceConfig here:\\n{ConfidenceConfig().__doc__}\\n\")\n", + "\n", + "# Put the created ConfidenceConfig into the model decoding config via .change_decoding_strategy()\n", + "model.change_decoding_strategy(\n", + " RNNTDecodingConfig(fused_batch_size=-1, strategy=\"greedy_batch\", confidence_cfg=confidence_cfg)\n", + " if is_rnnt\n", + " else CTCDecodingConfig(confidence_cfg=confidence_cfg)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "04581687", + "metadata": { + "id": "efe0baea" + }, + "source": [ + "## 3.4. Decode test set and get transcriptions with confidence scores\n", + "Let's transcribe Librispeech _test-other_ and see what confidence scores are inside." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5f92257", + "metadata": { + "id": "ccd8d0de" + }, + "outputs": [], + "source": [ + "current_test_set = test_sets[\"test_other\"]\n", + "transcriptions = model.transcribe(paths2audio_files=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)\n", + "if is_rnnt:\n", + " transcriptions = transcriptions[0]" + ] + }, + { + "cell_type": "markdown", + "id": "ca282352", + "metadata": { + "id": "0500514e" + }, + "source": [ + "For a transcribed hypothesis, there can be `frame_confidence` and aggregated from them `token_confidence` and `word_confidence`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18663384", + "metadata": { + "id": "98035fd2" + }, + "outputs": [], + "source": [ + "tran = transcriptions[0]\n", + "print(\n", + " f\"\"\" Recognized text: `{tran.text}`\\n\n", + " Word confidence: {[round(c, 3) for c in tran.word_confidence]}\\n\n", + " Token confidence: {[round(c, 3) for c in tran.token_confidence]}\\n\n", + " Frame confidence: {[([round(cc, 3) for cc in c] if is_rnnt else round(c, 3)) for c in tran.frame_confidence]}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "783e9e2a", + "metadata": { + "id": "9613bfc1" + }, + "source": [ + "Now let's draw the recognition results highlighted according to their confidence scores.\n", + "\n", + "There are four options: plain text and HTML with or without WER labels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "642fe059", + "metadata": { + "id": "a83295ff" + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.metrics.wer import word_error_rate, word_error_rate_detail, word_error_rate_per_utt\n", + "\n", + "def show_dataset_with_confidence(\n", + " indices,\n", + " transcriptions,\n", + " test_set,\n", + " threshold,\n", + " filepaths=None,\n", + " html_show=False,\n", + " min_dur_to_show=0.0,\n", + " utt_to_show=10\n", + "):\n", + " utt_shown = 0\n", + " for i, _ in indices:\n", + " if utt_shown >= utt_to_show:\n", + " break\n", + " if test_set.durations[i] >= min_dur_to_show:\n", + " print(\"=\"*120)\n", + " hyp = transcriptions[i].text\n", + " scores = transcriptions[i].word_confidence\n", + " ref = test_set.reference_texts[i]\n", + " pretty_print_transcript_with_confidence(hyp, scores, threshold, ref, html=html_show)\n", + " if filepaths is not None:\n", + " display(Audio(filepaths[i]))\n", + " utt_shown += 1\n", + "\n", + "\n", + "# you can play with these parameters\n", + "threshold = 0.52\n", + "# in colab, you may want to use `html_show = True` as non-html colorion displayed 
incorrectly in colab\n", + "html_show = is_colab\n", + "min_dur_to_show = 4.0\n", + "utt_to_show = 5\n", + "\n", + "wer_per_utt, avg_wer = word_error_rate_per_utt([h.text for h in transcriptions], current_test_set.reference_texts)\n", + "sorted_wer_indices = sorted(enumerate(wer_per_utt), key=lambda x: x[1])[::-1]\n", + "\n", + "show_dataset_with_confidence(\n", + " indices=sorted_wer_indices,\n", + " transcriptions=transcriptions,\n", + " test_set=current_test_set,\n", + " threshold=threshold,\n", + " filepaths=current_test_set.filepaths,\n", + " html_show=html_show,\n", + " min_dur_to_show=min_dur_to_show,\n", + " utt_to_show=utt_to_show\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9468ad3e", + "metadata": { + "id": "dbfcb2da" + }, + "source": [ + "## 3.5. Confidence metrics\n", + "\n", + "There are several metrics to evaluate the effectiveness of a confidence estimation method. Some of them consider confidence estimation as a binary classification task. Others measure how close the correct word confidence scores are to $1.0$ and the incorrect word scores are to $0.0$.\n", + "\n", + "Some of them are:\n", + "1. Area Under the Receiver Operating Characteristic Curve ($\\mathrm{AUC}_\\mathrm{ROC}$): class separability metric.\n", + "2. Area Under the Precision-Recall Curve ($\\mathrm{AUC}_\\mathrm{PR}$): how well the correct words are detected.\n", + "3. Area Under the Negative Predictive Value vs. True Negative Rate Curve ($\\mathrm{AUC}_\\mathrm{NT}$): how well the incorrect words are detected ($\\mathrm{AUC}_\\mathrm{PR}$ in which errors are treated as positives).\n", + "4. Normalized Cross Entropy ($\\mathrm{NCE}$): how close the confidence of correct predictions is to $1.0$ and that of incorrect predictions is to $0.0$. It ranges from $-\\infty$ to $1.0$, with negative scores indicating that the confidence method performs worse than setting the confidence score to $1-\\mathrm{WER}$. This metric is also known as Normalized Mutual Information.\n", + "5.
Expected Calibration Error ($\\mathrm{ECE}$): a weighted average over the absolute accuracy/confidence difference. It ranges from $0.0$ to $1.0$ with the best value $0.0$.\n", + "\n", + "Metrics based on the Youden's curve (see https://en.wikipedia.org/wiki/Youden%27s_J_statistic) can also be considered. They are:\n", + "1. Area Under the Youden's curve ($\\mathrm{AUC}_\\mathrm{YC}$): the rate of the effective threshold range (i.e. the adjustability or responsiveness). It ranges from $0.0$ to $1.0$ with the best value $0.5$.\n", + "2. Maximum of the Youden's curve ($\\mathrm{MAX}_\\mathrm{YC}$): the optimal $\\mathrm{TNR}$ vs. $\\mathrm{FNR}$ tradeoff. Its unnormalized version can be used as a criterion for selecting the optimal threshold $\\tau$. It ranges from $0.0$ to $1.0$ with the best value $1.0$.\n", + "3. The standard deviation of the Youden's curve values ($\\mathrm{STD}_\\mathrm{YC}$): indicates that $\\mathrm{TNR}$ and $\\mathrm{FNR}$ increase at different rates (viz. $\\mathrm{TNR}$ grows faster) as the $\\tau$ increases. It ranges from $0.0$ to $0.5$ with the best value around $0.25$.\n", + "\n", + "When selecting/tuning a confidence method, it is recommended to maximize $\\mathrm{AUC}_\\mathrm{ROC}$ first as this is the main metric of confidence estimation quality. Then, for overconfident models, maximizing $\\mathrm{AUC}_\\mathrm{NT}$ should take precedence over $\\mathrm{AUC}_\\mathrm{PR}$. Finally, a trade-off between $\\mathrm{NCE}$/$\\mathrm{ECE}$ and the family of $\\mathrm{YC}$ metrics can be considered as a compromise between formal correctness and controllability.\n", + "\n", + "Let's see how well our confidence performs according to the metrics above."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b0fa793", + "metadata": { + "id": "5d152775" + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.parts.utils.confidence_metrics import (\n", + " auc_nt,\n", + " auc_pr,\n", + " auc_roc,\n", + " auc_yc,\n", + " ece,\n", + " nce,\n", + " save_confidence_hist,\n", + " save_custom_confidence_curve,\n", + " save_nt_curve,\n", + " save_pr_curve,\n", + " save_roc_curve,\n", + ")\n", + "\n", + "\n", + "targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]\n", + "correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)]\n", + "\n", + "y_true, y_score = np.array(\n", + " [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]\n", + ").T\n", + "\n", + "\n", + "# output scheme: yc.mean(), yc.max(), yc.std() or yc.mean(), yc.max(), yc.std(), (thresholds, yc)\n", + "result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)\n", + "# output scheme: ece or ece, (thresholds, ece_curve)\n", + "results_ece = ece(y_true, y_score, return_curve=True)\n", + "results = [\n", + " auc_roc(y_true, y_score),\n", + " auc_pr(y_true, y_score),\n", + " auc_nt(y_true, y_score),\n", + " nce(y_true, y_score),\n", + " results_ece[0],\n", + "] + list(result_yc[:3])\n", + "\n", + "print(\n", + " f\"\"\" AUC_ROC:\\t{results[0]:.5f}\n", + " AUC_PR:\\t{results[1]:.5f}\n", + " AUC_NT:\\t{results[2]:.5f}\n", + " NCE:\\t{results[3]:.5f}\n", + " ECE:\\t{results[4]:.5f}\n", + " AUC_YC:\\t{results[5]:.5f}\n", + " MAX_YC:\\t{results[7]:.5f}\n", + " STD_YC:\\t{results[6]:.5f}\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0c3f6299", + "metadata": { + "id": "4159034d" + }, + "source": [ + "Confidence metrics for the maximum probability confidence are not that great.\n", + "\n", + "Let's re-run and benchmark confidence estimation with the 
default confidence estimator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c0e3a9f", + "metadata": { + "id": "d2e16f5f" + }, + "outputs": [], + "source": [ + "confidence_cfg = ConfidenceConfig(\n", + " preserve_word_confidence=True,\n", + " preserve_token_confidence=True,\n", + ")\n", + "\n", + "model.change_decoding_strategy(\n", + " RNNTDecodingConfig(fused_batch_size=-1, strategy=\"greedy_batch\", confidence_cfg=confidence_cfg)\n", + " if is_rnnt\n", + " else CTCDecodingConfig(confidence_cfg=confidence_cfg)\n", + ")\n", + "\n", + "transcriptions = model.transcribe(paths2audio_files=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)\n", + "if is_rnnt:\n", + " transcriptions = transcriptions[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f1cc77", + "metadata": { + "id": "6201ea4d" + }, + "outputs": [], + "source": [ + "targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]\n", + "correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)]\n", + "\n", + "y_true, y_score = np.array(\n", + " [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]\n", + ").T\n", + "\n", + "result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=True)\n", + "results_ece = ece(y_true, y_score, return_curve=True)\n", + "results = [\n", + " auc_roc(y_true, y_score),\n", + " auc_pr(y_true, y_score),\n", + " auc_nt(y_true, y_score),\n", + " nce(y_true, y_score),\n", + " results_ece[0],\n", + "] + list(result_yc[:3])\n", + "\n", + "print(\n", + " f\"\"\" AUC_ROC:\\t{results[0]:.5f}\n", + " AUC_PR:\\t{results[1]:.5f}\n", + " AUC_NT:\\t{results[2]:.5f}\n", + " NCE:\\t{results[3]:.5f}\n", + " ECE:\\t{results[4]:.5f}\n", + " AUC_YC:\\t{results[5]:.5f}\n", + " MAX_YC:\\t{results[7]:.5f}\n", + " STD_YC:\\t{results[6]:.5f}\n", + " \"\"\"\n", + ")" + ] + 
}, + { + "cell_type": "markdown", + "id": "9ab2b130", + "metadata": { + "id": "498e03d0" + }, + "source": [ + "Note that despite the overall improvement, $NCE$ and $ECE$ have gotten worse. This is due to class imbalance caused by low WER." + ] + }, + { + "cell_type": "markdown", + "id": "f96cea04", + "metadata": { + "id": "45856cba" + }, + "source": [ + "Now, let's draw $\\mathrm{ROC}$ as well as histograms of correctly and incorrectly recognized words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81844713", + "metadata": { + "id": "ff049043" + }, + "outputs": [], + "source": [ + "from tempfile import TemporaryDirectory\n", + "\n", + "\n", + "plot_dir = TemporaryDirectory()\n", + "os.makedirs(plot_dir.name, exist_ok=True)\n", + "\n", + "mask_correct = y_true == 1\n", + "y_score_correct = y_score[mask_correct]\n", + "y_score_incorrect = y_score[~mask_correct]\n", + "\n", + "# histogram of the correct distribution\n", + "save_confidence_hist(y_score_correct, plot_dir.name, \"hist_correct\")\n", + "# histogram of the incorrect distribution\n", + "save_confidence_hist(y_score_incorrect, plot_dir.name, \"hist_incorrect\")\n", + "# AUC-ROC curve\n", + "save_roc_curve(y_true, y_score, plot_dir.name, \"roc\")\n", + "\n", + "\n", + "display(\n", + " Image(filename=os.path.join(plot_dir.name, \"hist_correct.png\"), retina=True),\n", + " Image(filename=os.path.join(plot_dir.name, \"hist_incorrect.png\"), retina=True),\n", + " Image(filename=os.path.join(plot_dir.name, \"roc.png\"), retina=True),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "841a27ca", + "metadata": {}, + "source": [ + "Optionally, you can look at curves for other metrics ($\\mathrm{PR}$, $\\mathrm{NT}$, $\\mathrm{ECE}$, and $\\mathrm{YC}$)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6164e8f0", + "metadata": {}, + "outputs": [], + "source": [ + "# AUC-PR curve\n", + "save_pr_curve(y_true, y_score, plot_dir.name, \"pr\")\n", + "# AUC-NT curve\n", + "save_nt_curve(y_true, y_score, plot_dir.name, \"nt\")\n", + "# ECE curve\n", + "ece_thresholds, ece_values = results_ece[-1]\n", + "ece_values /= max(ece_values)\n", + "save_custom_confidence_curve(\n", + " ece_thresholds, ece_values, plot_dir.name, \"ece\", \"Threshold\", \"|Accuracy − Confidence score|\"\n", + ")\n", + "# AUC-YC curve\n", + "yc_thresholds, yc_values = result_yc[-1]\n", + "save_custom_confidence_curve(\n", + " yc_thresholds, yc_values, plot_dir.name, \"yc\", \"Threshold\", \"True positive rate − False Positive Rate\"\n", + ")\n", + "\n", + "\n", + "display(\n", + " Image(filename=os.path.join(plot_dir.name, \"pr.png\"), retina=True),\n", + " Image(filename=os.path.join(plot_dir.name, \"nt.png\"), retina=True),\n", + " Image(filename=os.path.join(plot_dir.name, \"ece.png\"), retina=True),\n", + " Image(filename=os.path.join(plot_dir.name, \"yc.png\"), retina=True),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9f63a172", + "metadata": { + "id": "ad78630a" + }, + "source": [ + "You can use `scripts/speech_recognition/confidence/benchmark_asr_confidence.py` to find optimal confidence hyperparameters." + ] + }, + { + "cell_type": "markdown", + "id": "1d9a822d", + "metadata": { + "id": "15e25521" + }, + "source": [ + "# 4. Confidence applications" + ] + }, + { + "cell_type": "markdown", + "id": "8ab6e666", + "metadata": { + "id": "dbb82877" + }, + "source": [ + "## 4.1. Small WER improvement\n", + "\n", + "Good confidence scores can slightly reduce WER by removing low confidence words from recognition results.\n", + "\n", + "Consider the following example."
+ ] + }, + { + "cell_type": "markdown", + "id": "4038863c", + "metadata": { + "id": "02eb4e1f" + }, + "source": [ + "Let's look at the detailed WER of the transcribed test set before and after removing words with low confidence scores." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "204d36ac", + "metadata": { + "id": "fdf790b5" + }, + "outputs": [], + "source": [ + "drop_low_confidence_words = lambda x, y, z: \" \".join([xx for xx, yy in zip(x.split(), y) if yy >= z])\n", + "\n", + "\n", + "threshold = 0.001\n", + "\n", + "wer_initial = word_error_rate_detail([h.text for h in transcriptions], current_test_set.reference_texts)\n", + "print(\n", + " f\"\"\"WER detail before removing low confidence words:\n", + " WER:\\t{wer_initial[0]:.5f}\n", + " INS_rate:\\t{wer_initial[2]:.5f}\n", + " DEL_rate:\\t{wer_initial[3]:.5f}\n", + " SUB_rate:\\t{wer_initial[4]:.5f}\"\"\"\n", + ")\n", + "\n", + "wer_conf_dropped = word_error_rate_detail(\n", + " [drop_low_confidence_words(hyp.text, hyp.word_confidence, threshold) for hyp in transcriptions],\n", + " current_test_set.reference_texts,\n", + ")\n", + "print(\n", + " f\"\"\"WER detail after removing low confidence words:\n", + " WER:\\t{wer_conf_dropped[0]:.5f}\n", + " INS_rate:\\t{wer_conf_dropped[2]:.5f}\n", + " DEL_rate:\\t{wer_conf_dropped[3]:.5f}\n", + " SUB_rate:\\t{wer_conf_dropped[4]:.5f}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4f153cdd", + "metadata": { + "id": "28ac85b1" + }, + "source": [ + "You can see that the right (in this example, extremely low) `threshold` can reduce WER by a tiny bit, reducing insertions and substitutions yet increasing deletions.\n", + "\n", + "Now let's see how to find the optimal threshold.\n", + "\n", + "The most commonly used method for automatically determining the optimal cutoff threshold is taking the value which delivers the maximum of the unnormalized Youden's curve.
This method allows you to remove the largest number of incorrect entities, sacrificing the minimum number of correct entities.\n", + "\n", + "However, the unnormalized $\\mathrm{MAX}_\\mathrm{YC}$ method does not work well for the purpose of the WER reduction. Let's compare this method to explicitly minimizing WER with respect to a threshold." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19147b4a", + "metadata": { + "id": "9b81e449" + }, + "outputs": [], + "source": [ + "from joblib import Parallel, delayed\n", + "from multiprocessing import cpu_count\n", + "from tqdm.notebook import tqdm\n", + "\n", + "def max_unnnormalized_yc(\n", + " y_true: Union[List[int], np.ndarray],\n", + " y_score: Union[List[float], np.ndarray],\n", + " n_bins: int = 100,\n", + " start: float = 0.0,\n", + " stop: float = 1.0,\n", + "):\n", + " \"\"\"Calculate the maximum of the unnormalized Youden's curve.\n", + " \"\"\"\n", + " y_true = np.array(y_true)\n", + " y_score = np.array(y_score)\n", + " thresholds = np.linspace(start, stop, n_bins + 1)\n", + " assert len(y_true) == len(y_score)\n", + " assert np.all(y_true >= 0) and np.all(y_true <= 1)\n", + " if np.all(y_true == 0) or np.all(y_true == 1):\n", + " return 0.0, 0.0\n", + " mask_correct = y_true == 1\n", + " y_score_correct = y_score[mask_correct]\n", + " y_score_incorrect = y_score[~mask_correct]\n", + " unnnormalized_yc = []\n", + " for threshold in thresholds:\n", + " tn = len((y_score_incorrect < threshold).nonzero()[0])\n", + " fn = len((y_score_correct < threshold).nonzero()[0])\n", + " unnnormalized_yc.append((threshold, tn - fn))\n", + " return max(unnnormalized_yc, key=lambda x: x[1])[0]\n", + "\n", + "\n", + "def min_wer(ref: List[str], transcriptions, n_bins: int = 100, start: float = 0.0, stop: float = 1.0):\n", + " \"\"\"Find the threshold value that delivers the minimum WER.\n", + " \"\"\"\n", + " thresholds = np.linspace(start, stop, n_bins + 1)\n", + " hyp = [(hyp.text, 
hyp.word_confidence) for hyp in transcriptions]\n", + " _get_wer = lambda x, y, z: (x, word_error_rate_detail([drop_low_confidence_words(yy[0], yy[1], x) for yy in y], z)[0])\n", + " wers = Parallel(n_jobs=cpu_count())(delayed(_get_wer)(threshold, hyp, ref) for threshold in tqdm(thresholds))\n", + " return min(wers, key=lambda x: x[1])\n", + "\n", + "\n", + "targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions]\n", + "correct_marks = [\n", + " get_correct_marks(r.split(), h.words) for r, h in zip(current_test_set.reference_texts, transcriptions)\n", + "]\n", + "y_true, y_score = np.array(\n", + " [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)]\n", + ").T\n", + "\n", + "threshold_yc = max_unnnormalized_yc(y_true, y_score)\n", + "yc_wer_value = word_error_rate(\n", + " [drop_low_confidence_words(hyp.text, hyp.word_confidence, threshold_yc) for hyp in transcriptions],\n", + " current_test_set.reference_texts,\n", + ")\n", + "threshold_min_wer, min_wer_value = min_wer(current_test_set.reference_texts, transcriptions, stop=0.1)\n", + "\n", + "print(\n", + " f\"\"\" Initial WER: {wer_initial[0]:.5f}\n", + " Optimal threshold and WER based on the Youden's curve: {threshold_yc}, {yc_wer_value:.5f}\n", + " Optimal threshold for the minimum WER: {threshold_min_wer}, {min_wer_value:.5f}\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "425d010e", + "metadata": { + "id": "3b278d2d" + }, + "source": [ + "As you can see, the optimal cutoff threshold as the maximum of the Youden's curve makes WER significantly worse, and the optimal threshold for the minimum WER is near zero.\n", + "\n", + "Let's use a different confidence estimation setup to see if we can improve WER at least a bit further." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d080686", + "metadata": { + "id": "39f72c78" + }, + "outputs": [], + "source": [ + "confidence_cfg = ConfidenceConfig(\n", + " preserve_word_confidence=True,\n", + " preserve_token_confidence=True,\n", + " aggregation=\"min\",\n", + " measure_cfg=DictConfig({\"entropy_type\": \"tsallis\", \"alpha\": 1.5, \"entropy_norm\": \"lin\"}),\n", + ")\n", + "\n", + "model.change_decoding_strategy(\n", + " RNNTDecodingConfig(fused_batch_size=-1, strategy=\"greedy_batch\", confidence_cfg=confidence_cfg)\n", + " if is_rnnt\n", + " else CTCDecodingConfig(confidence_cfg=confidence_cfg)\n", + ")\n", + "\n", + "transcriptions = model.transcribe(paths2audio_files=current_test_set.filepaths, batch_size=16, return_hypotheses=True, num_workers=4)\n", + "if is_rnnt:\n", + " transcriptions = transcriptions[0]\n", + "\n", + "threshold_min_wer, min_wer_value = min_wer(current_test_set.reference_texts, transcriptions)\n", + "\n", + "print(\n", + " f\"\"\" Initial WER: {wer_initial[0]:.5f}\n", + " Optimal threshold for the minimum WER: {threshold_min_wer}, {min_wer_value:.5f}\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e3c9cc02", + "metadata": { + "id": "e00581b1" + }, + "source": [ + "Overall, such an improvement in WER is too small to be considered. However, this opens up the possibility of improving WER through the use of more accurate confidence estimation methods." + ] + }, + { + "cell_type": "markdown", + "id": "694d1752", + "metadata": { + "id": "f9f89665" + }, + "source": [ + "## 4.2. Reducing hallucinations with confidence scores\n", + "\n", + "One common application of confidence scores is the removal of recognition hallucinations.\n", + "\n", + "Let's see how this can be done." 
+ ] + }, + { + "cell_type": "markdown", + "id": "98a1ef83", + "metadata": { + "id": "c1c28379" + }, + "source": [ + "Firstly, let's obtain a dataset on which the ASR model can hallucinate.\n", + "\n", + "Here we make it from the librosa examples, reversing them and convolving with each other." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f12a5041", + "metadata": { + "id": "3b0a0b4c" + }, + "outputs": [], + "source": [ + "from itertools import combinations\n", + "import json\n", + "import librosa\n", + "import soundfile as sf\n", + "\n", + "def cyclic_sum(x, y):\n", + " if x.shape[0] < y.shape[0]:\n", + " x, y = y, x\n", + " if x.shape[0] > y.shape[0]:\n", + " y = np.take(y, range(0, x.shape[0]), mode='wrap')\n", + " return x + y\n", + "\n", + "def generate_noise_examples(example_list: List[str], save_dir: str, samplerate: int = 16000):\n", + " \"\"\"Generate noise examples with librosa.\n", + " It loads the selected example, inverts and perturbs them with each other.\n", + "\n", + " Returns:\n", + " A manifest with the noise wavs.\n", + " \"\"\"\n", + " samples = {ex: librosa.core.load(librosa.util.example(key=ex, hq=True), sr=samplerate)[0] \n", + " for ex in example_list}\n", + " noise_samples = {\"_\".join([left, right]): cyclic_sum(samples[left][::-1], samples[right][::-1]) \n", + " for left, right in combinations(samples.keys(), 2)}\n", + "\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " manifest = os.path.join(save_dir, \"manifest.json\")\n", + " with open(manifest, \"tw\", encoding=\"utf-8\") as fout:\n", + " for k, v in noise_samples.items():\n", + " audio_path = os.path.join(save_dir, f\"{k}.wav\")\n", + " sf.write(audio_path, v, samplerate=samplerate)\n", + " metadata = {\n", + " \"audio_filepath\": audio_path,\n", + " \"duration\": librosa.core.get_duration(y=v, sr=samplerate),\n", + " \"label\": \"noise\",\n", + " \"text\": \"_\"\n", + " }\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + "\n", + " return 
manifest\n", + "\n", + "librosa_list_examples = ['brahms',\n", + " 'choice',\n", + " 'fishin',\n", + " 'humpback',\n", + " 'libri1',\n", + " 'libri2',\n", + " 'libri3',\n", + " 'nutcracker',\n", + " 'pistachio',\n", + " 'robin',\n", + " 'sweetwaltz',\n", + " 'trumpet',\n", + " 'vibeace']\n", + "sr = 16000\n", + "\n", + "noise_dir = os.path.join(DATA_DIR, \"noise\")\n", + "noise_manifest = generate_noise_examples(librosa_list_examples, noise_dir, sr)" + ] + }, + { + "cell_type": "markdown", + "id": "f28da61f", + "metadata": {}, + "source": [ + "The original examples contain speech, music, or noise. The resulting audio recordings are considered to contain no recognizable speech.\n", + "\n", + "You can listen to an example of the audios." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b4e7007", + "metadata": {}, + "outputs": [], + "source": [ + "noise_data = load_data(noise_manifest)\n", + "\n", + "display(Audio(noise_data.filepaths[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "1db80ae4", + "metadata": { + "id": "f7f9ddca" + }, + "source": [ + "Now let's transcribe our new data, setting the default confidence estimator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a872926", + "metadata": { + "id": "60f39094" + }, + "outputs": [], + "source": [ + "confidence_cfg = ConfidenceConfig(\n", + " preserve_word_confidence=True,\n", + " preserve_token_confidence=True,\n", + ")\n", + "\n", + "model.change_decoding_strategy(\n", + " RNNTDecodingConfig(fused_batch_size=-1, strategy=\"greedy_batch\", confidence_cfg=confidence_cfg)\n", + " if is_rnnt\n", + " else CTCDecodingConfig(confidence_cfg=confidence_cfg)\n", + ")\n", + "\n", + "noise_transcriptions = model.transcribe(\n", + " paths2audio_files=noise_data.filepaths, batch_size=4, return_hypotheses=True, num_workers=4\n", + ")\n", + "if is_rnnt:\n", + " noise_transcriptions = noise_transcriptions[0]" + ] + }, + { + "cell_type": "markdown", + "id": "3d097ca6", + "metadata": { + "id": "2f192186" + }, + "source": [ + "On a fully non-speech dataset, hallucinations can be measured as the Word Insertions per Second (WIS) value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19c6321c", + "metadata": { + "id": "3589da00" + }, + "outputs": [], + "source": [ + "def word_insertions_per_second(texts: List[str], durations: List[float]):\n", + " \"\"\"Calculate the Word Insertions per Second (WIS) value for the given recognition results \n", + " and their corresponding audio duration.\n", + " \"\"\"\n", + " assert len(texts) == len(durations)\n", + "\n", + " wis_per_utt = [len(text.split(\" \")) / duration for text, duration in zip(texts, durations)]\n", + " return sum(wis_per_utt) / len(wis_per_utt), wis_per_utt\n", + "\n", + "wis, wis_per_utt = word_insertions_per_second([t.text for t in noise_transcriptions], noise_data.durations)\n", + "print(f\"Original Word Insertions per Second: {wis:.5f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcf44daf", + "metadata": { + "id": "a0d8135d" + }, + "source": [ + "Now, the ability of a confidence estimator to detect hallucinations is computed as the 
Hallucination Detection Rate (HDR).\n", + "\n", + "It shows how many of all hallucinations can be removed, provided that no more than some fixed percentage of correct words are erroneously removed (under normal recognition conditions).\n", + "\n", + "HDR is another name of the metric $\\mathrm{TNR}_{FNR=e}$ which is calculated as $\\mathrm{TNR}(Y,\\tau): \\mathrm{FNR}(X,\\tau) \\approx e$, where $X$ is the dataset with supervision (to tune $\\tau$) and $Y$ is the noise-only dataset. Typical $e$ value is 0.05.\n", + "\n", + "Let's compute HDR and the new WIS.\n", + "\n", + "The generated dataset is clearly distinct from speech, so $e=0.01$ is sufficient." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dac1f7d", + "metadata": { + "id": "0612ccf6" + }, + "outputs": [], + "source": [ + "def hdr(\n", + " y_true_speech: Union[List[int], np.ndarray],\n", + " y_score_speech: Union[List[float], np.ndarray],\n", + " y_score_noise: Union[List[float], np.ndarray],\n", + " max_fnr: float = 0.05,\n", + " n_bins: int = 100,\n", + ") -> Tuple[float, float]:\n", + " \"\"\"Compute Hallucination Detection Rate (HDR) from prediction scores.\n", + "\n", + " Returns:\n", + " tnr: True-Negative Rate for HDR\n", + " threshold_hdr: Optimal threshold \n", + " \"\"\"\n", + " y_true_speech = np.array(y_true_speech)\n", + " y_score_speech = np.array(y_score_speech)\n", + " y_score_noise = np.array(y_score_noise)\n", + " thresholds = np.linspace(0, 1, n_bins + 1)\n", + " assert y_true_speech.shape[0] == y_score_speech.shape[0]\n", + " assert np.all(y_true_speech >= 0) and np.all(y_true_speech <= 1)\n", + " if np.all(y_true_speech == 0) or np.all(y_true_speech == 1):\n", + " return 0.0, 0.0\n", + " mask_correct = y_true_speech == 1\n", + " count_correct = max(mask_correct.nonzero()[0].shape[0], 1)\n", + " y_score_correct = y_score_speech[mask_correct]\n", + " threshold_hdr = 0.0\n", + " for threshold in thresholds:\n", + " fnr = (y_score_correct < 
threshold).nonzero()[0].shape[0] / count_correct\n", + " if fnr <= max_fnr:\n", + " threshold_hdr = threshold\n", + " else:\n", + " break\n", + " tnr = (y_score_noise < threshold_hdr).nonzero()[0].shape[0] / y_score_noise.shape[0]\n", + " return tnr, threshold_hdr\n", + "\n", + "\n", + "# e\n", + "max_fnr = 0.01\n", + "\n", + "correct_marks = [\n", + " mark for r, h in zip(current_test_set.reference_texts, transcriptions) for mark in get_correct_marks(r.split(), h.words)\n", + "]\n", + "y_score_speech = [w for h in transcriptions for w in h.word_confidence]\n", + "y_score_noise = [w for h in noise_transcriptions for w in h.word_confidence]\n", + "hdr_score, threshold_hdr = hdr(correct_marks, y_score_speech, y_score_noise, max_fnr=max_fnr)\n", + "wis_new = wis - wis * hdr_score\n", + "\n", + "hdr_score, wis_new\n", + "print(\n", + " f\"\"\" Hallucination Detection Rate for max_fnr={max_fnr} : {hdr_score:.5f}\n", + " New Word Insertions Per Second: {wis_new:.5f}\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "443938bc", + "metadata": { + "id": "418297d6" + }, + "source": [ + "Finally, let's print the noisy utterances to see if any more hallucinations persist." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dde9e7db", + "metadata": { + "id": "3815e8e3" + }, + "outputs": [], + "source": [ + "sorted_wis_indices = sorted(enumerate(wis_per_utt), key=lambda x: x[1])[::-1]\n", + "\n", + "show_dataset_with_confidence(\n", + " indices=sorted_wis_indices,\n", + " transcriptions=noise_transcriptions,\n", + " test_set=noise_data,\n", + " threshold=threshold_hdr,\n", + " filepaths=noise_data.filepaths,\n", + " html_show=is_colab,\n", + " min_dur_to_show=0.0,\n", + " utt_to_show=5,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "66f92938", + "metadata": { + "id": "0ac58ef2" + }, + "source": [ + "# Summary\n", + "This tutorial covered the basics of ASR confidence estimation and two examples of using ASR word confidence: WER reduction and hallucinations removal.\n", + "\n", + "You can follow this tutorial on [ASR Confidence-based Ensembles](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Confidence_Ensembles.ipynb) to see another important application of ASR confidence estimation." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/asr/Confidence_Ensembles.ipynb b/tutorials/asr/Confidence_Ensembles.ipynb index f9617c75e36a..4516d2b70d6d 100644 --- a/tutorials/asr/Confidence_Ensembles.ipynb +++ b/tutorials/asr/Confidence_Ensembles.ipynb @@ -110,7 +110,7 @@ "\n", "### How to estimate a model's confidence?\n", "\n", - "Good news, we have a whole separate [tutorial](TBD) on this topic! 
You can go through it if you want to know all the details about different ways to estimate confidence of NeMo ASR models. There are different confidence measures and aggregation functions and for the absolute best performance, you will need to run a grid-search to pick the best confidence estimation way for your specific models and data.\n", + "Good news, we have a whole separate [tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_Confidence_Estimation.ipynb) on this topic! You can go through it if you want to know all the details about different ways to estimate confidence of NeMo ASR models. There are different confidence measures and aggregation functions and for the absolute best performance, you will need to run a grid-search to pick the best confidence estimation way for your specific models and data.\n", "\n", "That being said, we found that there exist a set of confidence parameters that work pretty well on a large set of models and datsets. They are default in NeMo and so you might not need to worry about running the search. 
If you do want to maximize the performance by tuning the confidence parameters, you only need to add [a few extra config lines](#Building-and-evaluating-ensemble-(tuned-parameters)).\n", "\n", From 2ef544ffe6daa80d38d0f494a7e42adcac50a4b9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 16 Jul 2023 18:19:47 -0700 Subject: [PATCH 113/123] install_bs (#7019) (#7028) Signed-off-by: Nikolay Karpov Co-authored-by: Nikolay Karpov --- .../ngram_lm/install_beamsearch_decoders.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh index 558a84698f49..3ba337a6afd3 100755 --- a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh +++ b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh @@ -26,14 +26,15 @@ KENLM_MAX_ORDER=10 # Maximum order of KenLM model, also specified in the setup_o cd $NEMO_PATH if [ $(id -u) -eq 0 ]; then - alias aptupdate='apt-get update' - alias b2install='./b2' -else - alias aptupdate='sudo apt-get update' - alias b2install='sudo ./b2' + alias aptupdate='apt-get update' + alias b2install='./b2' + else + alias aptupdate='sudo apt-get update' + alias b2install='sudo ./b2' fi -aptupdate && apt-get upgrade -y && apt-get install -y liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for flashlight decoder' +aptupdate && apt-get upgrade -y && apt-get install -y liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for flashlight decoder + git clone https://github.com/NVIDIA/OpenSeq2Seq cd OpenSeq2Seq From 8b4b3820cee4612ef49884df3edc0d035b47cd13 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 17 Jul 2023 21:50:04 -0700 Subject: [PATCH 114/123] fixes for spellmapper (#6994) (#7000) Signed-off-by: 
Alexandra Antonova Co-authored-by: bene-ges Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> --- .../create_custom_vocab_index.py | 2 +- .../run_infer.sh | 2 +- .../spellchecking_asr_customization/utils.py | 84 +++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py index 07d64ec5b723..68c55ff51a4f 100644 --- a/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py +++ b/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py @@ -53,7 +53,7 @@ print("Size of customization vocabulary:", len(custom_phrases)) # Load n-gram mappings vocabulary -ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=125000) +ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=args.max_misspelled_freq) # Generate index of custom phrases phrases, ngram2phrases = get_index( diff --git a/examples/nlp/spellchecking_asr_customization/run_infer.sh b/examples/nlp/spellchecking_asr_customization/run_infer.sh index 09da98171c16..b4bbdc4da375 100644 --- a/examples/nlp/spellchecking_asr_customization/run_infer.sh +++ b/examples/nlp/spellchecking_asr_customization/run_infer.sh @@ -31,7 +31,7 @@ BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt ## File with input nemo ASR manifest INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json ## File containing custom words and phrases (plain text) -CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.json +CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.txt ## Other files will be created ## File with index of custom vocabulary diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py b/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py index cda551189d78..7385f19b414a 100644 
--- a/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/utils.py @@ -764,12 +764,30 @@ def check_banned_replacements(src: str, dst: str) -> bool: # anticipated => anticipate if src.endswith("ed") and dst.endswith("e") and src[0:-2] == dst[0:-1]: return True + # blocks => blocked + if src.endswith("s") and dst.endswith("ed") and src[0:-1] == dst[0:-2]: + return True + # blocked => blocks + if src.endswith("ed") and dst.endswith("s") and src[0:-2] == dst[0:-1]: + return True + # lives => lived + if src.endswith("es") and dst.endswith("ed") and src[0:-2] == dst[0:-2]: + return True + # lived => lives + if src.endswith("ed") and dst.endswith("es") and src[0:-2] == dst[0:-2]: + return True # regarded => regard if src.endswith("ed") and src[0:-2] == dst: return True # regard => regarded if dst.endswith("ed") and dst[0:-2] == src: return True + # regarding => regard + if src.endswith("ing") and src[0:-3] == dst: + return True + # regard => regarding + if dst.endswith("ing") and dst[0:-3] == src: + return True # longer => long if src.endswith("er") and src[0:-2] == dst: return True @@ -782,48 +800,102 @@ def check_banned_replacements(src: str, dst: str) -> bool: # discussing => discussed if src.endswith("ing") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: return True + # live => living + if src.endswith("e") and dst.endswith("ing") and src[0:-1] == dst[0:-3]: + return True + # living => live + if src.endswith("ing") and dst.endswith("e") and src[0:-3] == dst[0:-1]: + return True # discussion => discussing if src.endswith("ion") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: return True # discussing => discussion if src.endswith("ing") and dst.endswith("ion") and src[0:-3] == dst[0:-3]: return True + # alignment => aligning + if src.endswith("ment") and dst.endswith("ing") and src[0:-4] == dst[0:-3]: + return True + # aligning => alignment + if src.endswith("ing") and 
dst.endswith("ment") and src[0:-3] == dst[0:-4]: + return True # dispensers => dispensing if src.endswith("ers") and dst.endswith("ing") and src[0:-3] == dst[0:-3]: return True # dispensing => dispensers if src.endswith("ing") and dst.endswith("ers") and src[0:-3] == dst[0:-3]: return True + # integrate => integrity + if src.endswith("ate") and dst.endswith("ity") and src[0:-3] == dst[0:-3]: + return True + # integrity => integrate + if src.endswith("ity") and dst.endswith("ate") and src[0:-3] == dst[0:-3]: + return True # discussion => discussed if src.endswith("ion") and dst.endswith("ed") and src[0:-3] == dst[0:-2]: return True # discussed => discussion if src.endswith("ed") and dst.endswith("ion") and src[0:-2] == dst[0:-3]: return True + # anticipation => anticipate + if src.endswith("ion") and dst.endswith("e") and src[0:-3] == dst[0:-1]: + return True + # anticipate => anticipation + if src.endswith("e") and dst.endswith("ion") and src[0:-1] == dst[0:-3]: + return True # incremental => increment if src.endswith("ntal") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: return True # increment => incremental if src.endswith("nt") and dst.endswith("ntal") and src[0:-2] == dst[0:-4]: return True + # national => nation + if src.endswith("nal") and dst.endswith("n") and src[0:-3] == dst[0:-1]: + return True + # nation => national + if src.endswith("n") and dst.endswith("nal") and src[0:-1] == dst[0:-3]: + return True + # significantly => significant + if src.endswith("ntly") and dst.endswith("nt") and src[0:-4] == dst[0:-2]: + return True + # significant => significantly + if src.endswith("nt") and dst.endswith("ntly") and src[0:-2] == dst[0:-4]: + return True # delivery => deliverer if src.endswith("ery") and dst.endswith("erer") and src[0:-3] == dst[0:-4]: return True # deliverer => delivery if src.endswith("erer") and dst.endswith("ery") and src[0:-4] == dst[0:-3]: return True + # deliver => deliverer + if src.endswith("er") and dst.endswith("erer") and 
src[0:-2] == dst[0:-4]: + return True + # deliverer => deliver + if src.endswith("erer") and dst.endswith("er") and src[0:-4] == dst[0:-2]: + return True # comparably => comparable if src.endswith("bly") and dst.endswith("ble") and src[0:-3] == dst[0:-3]: return True # comparable => comparably if src.endswith("ble") and dst.endswith("bly") and src[0:-3] == dst[0:-3]: return True + # comparably => comparability + if src.endswith("bly") and dst.endswith("bility") and src[0:-3] == dst[0:-6]: + return True + # comparability => comparably + if src.endswith("bility") and dst.endswith("bly") and src[0:-6] == dst[0:-3]: + return True # beautiful => beautifully if src.endswith("l") and dst.endswith("lly") and src[0:-1] == dst[0:-3]: return True # beautifully => beautiful if src.endswith("lly") and dst.endswith("l") and src[0:-3] == dst[0:-1]: return True + # active => actively + if src.endswith("e") and dst.endswith("ely") and src[0:-1] == dst[0:-3]: + return True + # actively => active + if src.endswith("ely") and dst.endswith("e") and src[0:-3] == dst[0:-1]: + return True # america => american if src.endswith("a") and dst.endswith("an") and src[0:-1] == dst[0:-2]: return True @@ -836,6 +908,18 @@ def check_banned_replacements(src: str, dst: str) -> bool: # investing => reinvesting if dst.startswith("re") and dst[2:] == src: return True + # unchanged => changed + if src.startswith("un") and src[2:] == dst: + return True + # changed => unchanged + if dst.startswith("un") and dst[2:] == src: + return True + # disrespected => respected + if src.startswith("dis") and src[3:] == dst: + return True + # respected => disrespected + if dst.startswith("dis") and dst[3:] == src: + return True # outperformance => performance if src.startswith("out") and src[3:] == dst: return True From 9051440b313ff3bba07928319a4e4840c9aa55bb Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Tue, 18 Jul 2023 11:38:25 -0400 Subject: [PATCH 115/123] added back the 
retro documents (#7033) Signed-off-by: Yi Dong --- .../nlp/nemo_megatron/retro/retro_model.rst | 446 +++++++++++++++++- 1 file changed, 444 insertions(+), 2 deletions(-) diff --git a/docs/source/nlp/nemo_megatron/retro/retro_model.rst b/docs/source/nlp/nemo_megatron/retro/retro_model.rst index edbec3d1c2ca..ceff1baf857f 100644 --- a/docs/source/nlp/nemo_megatron/retro/retro_model.rst +++ b/docs/source/nlp/nemo_megatron/retro/retro_model.rst @@ -1,2 +1,444 @@ -Coming Soon ... -================ \ No newline at end of file +NeMo RETRO Model +================ + +The Retrieval-Enhanced Transformer (RETRO) model is an autoregressive language model that takes into account document chunks retrieved from a large +corpus when making predictions. The RETRO model has a similar architecture to the GPT model, but it includes an encoder that encodes the retrieved +context and cross-attention layers that integrate the context to improve the model's output. Below is a simple diagram of the RETRO model architecture. + +.. image:: images/arch.png + :align: center + :width: 800px + :alt: RETRO model architecture + +For more detailed information on the model, please refer to the `RETRO paper `_ :cite:`nlp-retro-borgeaud2021improving` by Deepmind. +The NeMo RETRO Model is an open-source implementation of the paper, and it has the following differences/features compared to Deepmind's proposed implementation: + +1. The NeMo RETRO Model is built on top of NeMo Megatron code, allowing for efficient training of large language models in a cluster environment. +2. The NeMo RETRO Model uses `Faiss `_ :cite:`nlp-retro-jegou2022faiss` as the KNN search library, which can be accelerated by GPUs. +3. The NeMo RETRO uses `RoPe relative positional encoding `_ :cite:`nlp-retro-su2021roformer`. +4. The NeMo RETRO uses `SentenceTransformers `_ :cite:`nlp-retro-reimers2019sentence` as the retriever encoder. +5. 
The NeMo RETRO supports `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor`, allowing for scalable training of the RETRO model via Zero-Shot Hyperparameter Transfer. + +Quick start +************ +Steps below demonstrate training and evaluating a NeMo RETRO model. + +Data pre-processing +------------------- + +Step 1: Collect training data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The RETRO model uses two types of data: training data, which typically consists of 64-token chunks, and retrieval data, which typically consists of 128-token chunks. +The training data is used to train the model, while the retrieval data is used to supplement the language model. +It's possible to use the same data for both training and retrieval, as long as duplicates are removed properly, as described below. +Both types of data are stored in a loose JSON format, with each line containing a single text sample. For example: + +.. code-block:: json + {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} + {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} +The name of the text field of the json can be changed by using the ``--json-keys`` flag in ``preprocess_data_for_megatron.py``. The other metadata are optional and are not used in training. + +Step 2: Convert training data into memory map format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The loose json is then processed into a binary format for training and retrieval. To convert the json into the mmap and cached index files, +set the ``--dataset-impl`` flag to ``retmmap``, which is the memory map format dedicated to the RETRO model. + +An example script to prepare data for RETRO training is: + +.. 
code-block:: bash + python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=/dataset/pubmed_train.jsonl \ + --json-keys=text \ + --tokenizer-library=megatron \ + --apply-ftfy \ + --dataset-impl=retmmap \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --tokenizer-type=GPT2BPETokenizer \ + --output-prefix=/result/pubmed_train \ + --need-pad-id \ + --append-eod \ + --retrieval-db \ + --chunk_size=64 \ + --workers=48 +The RETRO model processes chunked documents using 64 tokens as the default chunk size. The RETRO memory map dataset will add padding +tokens to the end of each document to make it a multiple of 64. The ``--need-pad-id`` argument adds a padding token to the tokenizer +if it doesn't already have one. The ``--append-eod`` argument controls whether to add ``end-of-document`` tokens to the preprocessed +data, and the ``--retrieval-db`` argument indicates whether to create a retrieval database for the preprocessed data. If ``--retrieval-db`` +is used, it will add an additional 64 padding tokens at the end of the document. The ``--chunk_size`` and ``--workers`` arguments +control the size of the data chunks to be processed and the number of worker processes to use, respectively. + +Following is the retro memory map index data format: + +.. 
list-table:: + :widths: 25 25 25 25 25 25 + + * - 'MMIDRET\x00\x00' (header 9 bytes) + - 1 (version 8 byte) + - dtype code :sup:`1` (1 byte) + - sentence count (8 byte) + - chunk size (8 byte) + - chunk count (8 byte) + * - retrieved db :sup:`2` (1 byte) + - number of tokens for each of sentences ( int32 array) + - start of sentence address in byte (int64 array) + - start of chunk id (int64 array) + - chunk id address in byte (int64 array) + - + +:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float, 7: np.double, 8: np.uint16 + +:sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer. +The number of tokens for each sentence includes the padded token ids. For retrieval data, there is an extra ``chunk_size`` padding at +the end of each sentence, and the ``retrieved_db`` flag is set to True. However, the number of tokens for each sentence excludes this extra ``chunk_size`` padding. + +Following is the retro memory map binary data format: + +.. list-table:: + :widths: 65 + + * - token id array for sentence 0,1, 2 ... (dtype :sup:`3` array) + +:sup:`3` np.uint16 if vocab_size < 65500 else np.int32 + +Step 3: Create Faiss index for retrieval data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After creating the memory map retrieval data binary file and index files, we can build a Faiss index that can quickly find the K-nearest neighbors of a given +chunk ID based on a query embedding vector. Because the retrieval data is typically very large, we break this process down into three steps. + +Step 3.1: Train the Faiss index structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This step uses a subset of the retrieval data to train an empty Faiss index. An example script is: + +.. 
code-block:: bash + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --input_file=/result/pubmed_train_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --percent=1.0 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --train_index_size=2000000 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --stage=0 \ + --output_file=/result/pubmed_faiss_learn.index +This command is used to build an empty Faiss index using the 2000000 training data in ``pubmed_train_text_document``. +The ``all-mpnet-base-v2`` sentence transformer model is used to encode the chunk tokens into an embedding vector. +The index will be saved in the result directory as ``pubmed_faiss_learn.index``. This command specifies using 8 GPUs to train the Faiss index. + +Step 3.2: Add retrieval data into sharding index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This step adds all the retrieval data to the empty Faiss index created in the previous step. An example script is: + +.. code-block:: bash + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --input_file=/result/pubmed_train_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --percent=1.0 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --shard_id=0 \ + --total_shards=10 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --stage=1 \ + --learned_index=/result/pubmed_faiss_learn.index \ + --output_file=/result/pubmed_faiss_shard0.save +This command breaks the retrieval data into ``total_shards`` shards and adds the data in the shard specified by ``shard_id``. +The result is saved to a file specified by ``output_file``. In the example above, 10 sharding indexes are created. 
+ +Step 3.3: Merge the sharding indexes into final Faiss index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This step merges all the sharding indexes created in the previous step into the final Faiss index. An example script is: + +.. code-block:: bash + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --stage=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --learned_index=/result/pubmed_faiss_learn.index \ + --shard_index_input=/result/pubmed_faiss_shard \ + --output_file=/result/pubmed_faiss_final.index +Step 4: Build KNN index +^^^^^^^^^^^^^^^^^^^^^^^ + +During training, it is inefficient to run a query to find the K-nearest neighbor chunk IDs for each training data point. +This can be pre-calculated by building a KNN index before training. The KNN index maps the training data chunk IDs to the K-nearest neighbor chunk IDs +in the retrieval data. As with building the Faiss index, this process is divided into two steps. + +Following is the KNN index data format: + +.. list-table:: + :widths: 25 25 25 25 45 + + * - 'KNNRETM\x00\x00' (header 9 bytes) + - 1 (version 8 byte) + - K number of neighbors (8 byte) + - Number chunks (8 byte) + - Map to K retrieval data chunk IDs, shape (number_chunks, K) ( int64 array) + +Step 4.1: Build KNN sharding index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The KNN index is built using the memory-mapped training data created by the ``preprocess_data_for_megatron.py`` script and the Faiss index +file for the retrieval data built by the ``build_retrieval_index.py`` script. + +An example script is: + +.. 
code-block:: bash + python scripts/nlp_language_modeling/build_knn_map_index.py \ + --input_file=/result/pubmed_eval_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --process_chunk_size=10000 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --K_neighbors=50 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --remove_duplicate \ + --dedup_margin=70 \ + --nprobe=100 \ + --shard_id=0 \ + --total_shards=10 \ + --stage=1 \ + --output_file=/dataset/pubmed_knn_shard0.save \ + --faiss_index=/result/pubmed_faiss_final.index +In this example, the training data is broken into ``total_shards`` shards, and the KNN index is calculated for the shard specified by ``shard_id``. +The result is saved to a file specified by ``output_file``. In the example above, 10 KNN sharding indexes are created. + +Use the ``remove_duplicate`` flag if the training data and retrieval data are the same to remove neighbors from the same document. + +Step 4.2: Merge KNN sharding index into final KNN index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An example script is: + +.. code-block:: bash + python scripts/nlp_language_modeling/build_knn_map_index.py \ + --stage=2 \ + --output_file=pubmed_knn_final.save \ + --shard_index_input=pubmed_knn_shard +Train NeMo RETRO Model +----------------------- + +Once the training data, retrieval data, KNN index, and Faiss index are prepared, we are ready to train the RETRO model. In the NeMo implementation, +the RETRO model can be pre-trained with or without the `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` feature. We will introduce both ways. + + +The table below lists some of the common parameters that can be configured for model pre-training. 
+ ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| **Parameter** | **Default** | **Description** | ++==================================+=============+========================================================================================+ +| model.micro_batch_size | 4 | the micro batch size used for training | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.tensor_model_parallel_size | 1 | tensor model parallel size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.encoder_seq_length | 2048 | token sequence length | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.chunk_size | 64 | the chunk size used to retrieve | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.enc_num_layers | 4 | total number of encoder layers | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.dec_num_layers | 6 | total number of decoder layers | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.enc_cross_attention | [3] | layer numbers for cross attention in encoder | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.dec_cross_attention | [3,4,5] | layer numbers for chunked cross attention in decoder | 
++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.add_position_embedding | FALSE | whether to add the absolute position encoding | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.hidden_size | 768 | model hidden size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.ffn_hidden_size | 3072 | model FFN hidden size. Usually 4 * hidden_size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.num_attention_heads | 12 | number of attention heads | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.init_method_std | 0.02 | standard deviation of the zero mean normal distribution used for weight initialization | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.hidden_dropout | 0.1 | dropout probability for hidden state transformer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.attention_dropout | 0.1 | dropout probability in the attention layer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.ffn_dropout | 0 | dropout probability in the feed-forward layer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ + + +Option 1: Train the NeMo RETRO model *without* mu-Transfer 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An example RETRO pre-training script is: + +.. code-block:: bash + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.devices=8 \ + trainer.num_nodes=2 \ + trainer.accelerator=gpu \ + trainer.max_steps=800000 \ + trainer.precision=16 \ + exp_manager.exp_dir=/result/retro_model \ + model.apply_query_key_layer_scaling=False \ + model.tensor_model_parallel_size=8 \ + model.optim.name=adamw \ + model.enc_num_layers=2 \ + model.dec_num_layers=32 \ + model.enc_cross_attention=[0] \ + model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + model.hidden_size=4096 \ + model.ffn_hidden_size=16384 \ + model.num_attention_heads=32 \ + model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ + model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ + model.data.data_prefix=[/result/pubmed_eval_text_document] \ + model.data.knn_index=[dataset/pubmed_knn_final.save] \ + model.data.retrieval_prefix=/result/pubmed_eval_text_document \ + model.micro_batch_size=8 +During the training, launch Tensorboard to monitor training like so: + +.. code-block:: bash + tensorboard --logdir /result/retro_model --bind_all +.. note:: Weights and Biases (WandB) is supported too. Add ``exp_manager.create_wandb_logger=True`` to the model training arguments to enable it. + +After the training, the model nemo file can be found at the result checkpoint directory. + +Option 2: Train the NeMo RETRO model *with* mu-Transfer +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +`mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` paper proposed a method to zero-shot transfer hyperparameter to train a larger model. +This can be done in 3 steps in NeMo RETRO implementation. + + +Step 1. 
find optimal hyper parameter for a small base model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the pre-training code in Option 1, either manually or automatically, find a set of optimal hyperparameters for a small base RETRO +model. This can be done cheaply and fast due to the small model size. + +Step 2. calculate the shape file that can be used to run mu-Transfer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The shape file determines which hyperparameters will be scaled up, allowing the model to adjust the learning rate, weight scaling factor, etc. + +Here is an example shape file calculation script: + + +.. code-block:: bash + python examples/nlp/language_modeling/megatron_retro_cal_shape.py \ + trainer.devices=8 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + exp_manager.exp_dir=/result/retro_model \ + base_model.enc_num_layers=2 \ + delta_model.enc_num_layers=2 \ + base_model.dec_num_layers=32 \ + delta_model.dec_num_layers=32 \ + base_model.tensor_model_parallel_size=8 \ + delta_model.tensor_model_parallel_size=8 \ + base_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + delta_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + base_model.enc_cross_attention=[0] \ + delta_model.enc_cross_attention=[0] \ + base_model.hidden_size=768 \ + base_model.ffn_hidden_size=3072 \ + delta_model.hidden_size=96 \ + delta_model.ffn_hidden_size=384 \ + base_model.num_attention_heads=16 \ + delta_model.num_attention_heads=16 \ + model.shape_file=tp8_32depth_o1_rel_shape_info.yaml +In this example, the ``base_model`` refers to the small base model for which an optimal set of hyperparameters has been determined. +The ``delta_model`` refers to a model with certain hyperparameters that have been scaled up or down. In this case, +the ``hidden_size`` and ``ffn_hidden_size`` have been changed in the ``delta_model``, allowing these two parameters to be scaled freely later. + +Step 3.
Pretrain mu-Transfer RETRO model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once the shape file is created, we can start training a RETRO model. The model training can be scaled up freely using the hyperparameters +specified by the delta model and the shape file. + +An example mu-Transfer pre-training script is: + +.. code-block:: bash + python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ + trainer.devices=8 \ + trainer.num_nodes=2 \ + trainer.accelerator=gpu \ + trainer.max_steps=500000 \ + trainer.precision=16 \ + exp_manager.exp_dir=/result/retro_model \ + model.apply_query_key_layer_scaling=False \ + model.tensor_model_parallel_size=8 \ + model.optim.name=muadamw \ + model.enc_num_layers=2 \ + model.dec_num_layers=32 \ + model.enc_cross_attention=[0] \ + model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + model.hidden_size=4096 \ + model.ffn_hidden_size=16384 \ + model.num_attention_heads=32 \ + model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ + model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ + model.data.data_prefix=[/result/pubmed_eval_text_document] \ + model.data.knn_index=[dataset/pubmed_knn_final.save] \ + model.data.retrieval_prefix=/result/pubmed_eval_text_document \ + model.micro_batch_size=8 \ + model.shape_file=tp8_32depth_o1_rel_shape_info.yaml +.. note:: We have chosen to use ``muadamw`` as the optimizer for use with the mu-transfer method. Currently, only ``muadam`` and ``muadamw`` are supported. + +Similarly to the pre-training in Option 1, the model nemo file can be found at the result checkpoint directory after training is complete. + +Run NeMo RETRO Model Inference +------------------------------- + +Once the NeMo RETRO model has been trained, we can put it into inference mode and experiment with it. +During inference, we are not limited to the static Faiss index that we built earlier for KNN queries. +We can feed any external data to the model as retrieval context.
NeMo RETRO implementation supports dynamic retrieval service, +allowing users to add, reset, and query new documents on the fly. + +We have built a simple web client that makes it easy for users to play around with the model. Here is an example script to launch the server: + +.. code-block:: bash + python examples/nlp/language_modeling/megatron_retro_eval.py \ + trainer.devices=8 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.precision=16 \ + retro_model_file=megatron_retro.nemo \ + tensor_model_parallel_size=8 \ + pipeline_model_parallel_size=1 \ + retrieval_service.sentence_bert.devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.0.faiss_devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.1.faiss_devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.0.faiss_index=/result/pubmed_faiss_final.index \ + retrieval_service.services.0.retrieval_index=/result/pubmed_eval_text_document \ + retrieval_service.neighbors=2 \ + retrieval_service.pad_tokens=True \ + retrieval_service.store_retrieved=True \ + server=True \ + web_server=True \ + share=True \ + username=test \ + password=test123 +Set the retro_model_file to use the nemo file generated in the pre-training step. After launching the server, copy-paste the URL from +the terminal into your browser. Use the specified username and password to log in and have fun experimenting with the RETRO model. + +References +************ + +.. 
bibliography:: ../../nlp_all.bib + :style: plain + :labelprefix: nlp-retro + :keyprefix: nlp-retro- From 84ae944f4a9af5612389e26a1a15e63368737abc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 09:26:28 -0700 Subject: [PATCH 116/123] Remove pyyaml (#7052) (#7054) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- requirements/requirements_lightning.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 100216aebc54..9c41c355e8cd 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,7 +1,6 @@ hydra-core>=1.2.0,<1.3 omegaconf>=2.2,<2.3 pytorch-lightning>=1.9.0,<=1.9.4 -pyyaml<6 # Pinned until omegaconf works with pyyaml>=6 torchmetrics>=0.11.0 transformers>=4.0.1 wandb From 47e782a7fee12e0e0109004c6b395ca71524f16f Mon Sep 17 00:00:00 2001 From: "Aleksey Grinchuk (Oleksii Hrinchuk)" Date: Tue, 18 Jul 2023 12:17:41 -0700 Subject: [PATCH 117/123] st standalone model (#6969) * st standalone model Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style fix Signed-off-by: AlexGrinch * sacrebleu import fix, unused imports removed Signed-off-by: AlexGrinch * import guard for nlp inside asr transformer bpe model Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * codeql fixes Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * comments answered Signed-off-by: AlexGrinch * import ordering fix Signed-off-by: AlexGrinch * yttm for asr removed Signed-off-by: AlexGrinch * logging added Signed-off-by: AlexGrinch * added inference and translate method Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: AlexGrinch Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../fast-conformer_transformer.yaml | 218 +++++++ .../speech_to_text_transformer.py | 70 ++ .../speech_translation/translate_speech.py | 210 ++++++ nemo/collections/asr/models/__init__.py | 1 + .../asr/models/transformer_bpe_models.py | 614 ++++++++++++++++++ .../tokenizers/sentencepiece_tokenizer.py | 2 +- 6 files changed, 1114 insertions(+), 1 deletion(-) create mode 100644 examples/asr/conf/speech_translation/fast-conformer_transformer.yaml create mode 100644 examples/asr/speech_translation/speech_to_text_transformer.py create mode 100644 examples/asr/speech_translation/translate_speech.py create mode 100644 nemo/collections/asr/models/transformer_bpe_models.py diff --git a/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml new file mode 100644 index 000000000000..4e480df62e59 --- /dev/null +++ b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml @@ -0,0 +1,218 @@ +# It contains the default values for training an autoregressive FastConformer-Transformer ST model with sub-word encoding. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of FastConformer-Transformer, other parameters are the same as in this config file. 
+# One extra (linear projection) layer is added between FastConformer encoder and Transformer decoder if they have different hidden sizes +# It is recommended to initialize FastConformer with ASR pre-trained encoder for better accuracy and faster convergence + +name: "FastConformer-Transformer-BPE-st" + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] + +model: + sample_rate: 16000 + label_smoothing: 0.0 + log_prediction: true # enables logging sample predictions in the output during training + + train_ds: + is_tarred: true + tarred_audio_filepaths: ??? + manifest_filepath: ??? + sample_rate: 16000 + shuffle: false + trim_silence: false + batch_size: 4 + num_workers: 8 + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + test_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + # recommend small vocab size of 128 or 256 when using 4x sub-sampling + # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + pad_value: 0.0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + # you may use lower time_masks for smaller models to have a faster convergence + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet or striding, vggnet may give better results but needs more memory + subsampling_factor: 8 # must be power of 2 + subsampling_conv_channels: 256 # -1 sets it to d_model + causal_downsampling: false + reduction: null + reduction_position: null + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: batch_norm + conv_context_size: 
null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + transf_encoder: + num_layers: 0 + hidden_size: 512 + inner_size: 2048 + num_attention_heads: 8 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + + transf_decoder: + library: nemo + model_name: null + pretrained: false + max_sequence_length: 512 + num_token_types: 0 + embedding_dropout: 0.1 + learn_positional_encodings: false + hidden_size: 512 + inner_size: 2048 + num_layers: 6 + num_attention_heads: 4 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + hidden_act: relu + pre_ln: true + pre_ln_final_layer_norm: true + + head: + num_layers: 1 + activation: relu + log_softmax: true + dropout: 0.0 + use_transformer_init: true + + beam_search: + beam_size: 4 + len_pen: 0.0 + max_generation_delta: 50 + + optim: + name: adam + lr: 0.0001 + # optimizer arguments + betas: [0.9, 0.98] + # less necessity for weight_decay as we already have large augmentations with SpecAug + # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used + # weight decay of 0.0 with lr of 2.0 also works fine + #weight_decay: 1e-3 + + # scheduler setup + sched: + name: InverseSquareRootAnnealing + #d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 1000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + gpus: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 100 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. 
+ log_every_n_steps: 100 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_sacreBLEU" + mode: "max" + save_top_k: 3 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null \ No newline at end of file diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py new file mode 100644 index 000000000000..0c0882859b88 --- /dev/null +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +# Training the model +```sh +python speech_to_text_transformer.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.audio.tarred_audio_filepaths= \ + model.train_ds.audio_manifest_filepath= \ + model.validation_ds.manifest_filepath= \ + model.test_ds.manifest_filepath= \ + model.tokenizer.dir= \ + model.tokenizer.model_path= \ + model.tokenizer.type= \ + trainer.gpus=-1 \ + trainer.accelerator="ddp" \ + trainer.max_epochs=100 \ + model.optim.name="adamw" \ + model.optim.lr=0.001 \ + model.optim.betas=[0.9,0.999] \ + model.optim.weight_decay=0.0001 \ + model.optim.sched.warmup_steps=2000 \ + exp_manager.create_wandb_logger=True \ + exp_manager.wandb_logger_kwargs.name="" \ + exp_manager.wandb_logger_kwargs.project="" +``` + + +""" + +import pytorch_lightning as pl +from omegaconf import OmegaConf + +from nemo.collections.asr.models import EncDecTransfModelBPE +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**cfg.trainer) + exp_manager(trainer, cfg.get("exp_manager", None)) + asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + asr_model.maybe_init_from_pretrained_checkpoint(cfg) + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and
cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py new file mode 100644 index 000000000000..203852b52ee9 --- /dev/null +++ b/examples/asr/speech_translation/translate_speech.py @@ -0,0 +1,210 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import json +import os +from dataclasses import dataclass, is_dataclass +from typing import List, Optional, Union + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.transcribe_utils import compute_output_filename, prepare_audio_data, setup_model +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Translate audio file on a single CPU/GPU. Useful for translations of moderate amounts of audio data. 
+ +# Arguments + model_path: path to .nemo ST checkpoint + pretrained_name: name of pretrained ST model (from NGC registry) + audio_dir: path to directory with audio files + dataset_manifest: path to dataset JSON manifest file (in NeMo format) + + output_filename: Output filename where the translations will be written + batch_size: batch size during inference + + cuda: Optional int to enable or disable execution of model on certain CUDA device. + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + amp: Bool to decide if Automatic Mixed Precision should be used during inference + audio_type: Str filetype of the audio. Supported = wav, flac, mp3 + + overwrite_translations: Bool which when set allows repeated translations to overwrite previous results. + +# Usage +ST model can be specified by either "model_path" or "pretrained_name". +Data for translation can be defined with either "audio_dir" or "dataset_manifest". +Results are returned in a JSON manifest file. 
+ +python translate_speech.py \ + model_path=null \ + pretrained_name=null \ + audio_dir="" \ + dataset_manifest="" \ + output_filename="" \ + batch_size=32 \ + cuda=0 \ + amp=True \ +""" + + +@dataclass +class ModelChangeConfig: + + # Sub-config for changes specific to the Conformer Encoder + conformer: ConformerChangeConfig = ConformerChangeConfig() + + +@dataclass +class TranslationConfig: + # Required configs + model_path: Optional[str] = None # Path to a .nemo file + pretrained_name: Optional[str] = None # Name of a pretrained model + audio_dir: Optional[str] = None # Path to a directory which contains audio files + dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest + audio_key: str = 'audio_filepath' # Used to override the default audio key in dataset_manifest + eval_config_yaml: Optional[str] = None # Path to a yaml file of config of evaluation + + # General configs + output_filename: Optional[str] = None + batch_size: int = 32 + random_seed: Optional[int] = None # seed number going to be used in seed_everything() + + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + audio_type: str = "wav" + + # Recompute model translation, even if the output folder exists with scores. 
+ overwrite_translations: bool = True + + # can be set to True to return list of translations instead of the config + # if True, will also skip writing anything to the output file + return_translations: bool = False + + +@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig) +def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]: + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + for key in cfg: + cfg[key] = None if cfg[key] == 'None' else cfg[key] + + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) + + if cfg.random_seed: + pl.seed_everything(cfg.random_seed) + + if cfg.model_path is None and cfg.pretrained_name is None: + raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!") + if cfg.audio_dir is None and cfg.dataset_manifest is None: + raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!") + + # Load augmentor from exteranl yaml file which contains eval info, could be extend to other feature such VAD, P&C + augmentor = None + if cfg.eval_config_yaml: + eval_config = OmegaConf.load(cfg.eval_config_yaml) + augmentor = eval_config.test_ds.get("augmentor") + logging.info(f"Will apply on-the-fly augmentation on samples during translation: {augmentor} ") + + # setup GPU + if cfg.cuda is None: + if torch.cuda.is_available(): + device = [0] # use 0th CUDA device + accelerator = 'gpu' + map_location = torch.device('cuda:0') + elif cfg.allow_mps and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + logging.warning( + "MPS device (Apple Silicon M-series GPU) support is experimental." + " Env variable `PYTORCH_ENABLE_MPS_FALLBACK=1` should be set in most cases to avoid failures." 
+ ) + device = [0] + accelerator = 'mps' + map_location = torch.device('mps') + else: + device = 1 + accelerator = 'cpu' + map_location = torch.device('cpu') + else: + device = [cfg.cuda] + accelerator = 'gpu' + map_location = torch.device(f'cuda:{cfg.cuda}') + + logging.info(f"Inference will be done on device: {map_location}") + + asr_model, model_name = setup_model(cfg, map_location) + trainer = pl.Trainer(devices=device, accelerator=accelerator) + asr_model.set_trainer(trainer) + asr_model = asr_model.eval() + + # collect additional translation information + return_hypotheses = False + + # prepare audio filepaths and decide wether it's partial audio + filepaths, partial_audio = prepare_audio_data(cfg) + + # setup AMP (optional) + if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): + logging.info("AMP enabled!\n") + autocast = torch.cuda.amp.autocast + else: + + @contextlib.contextmanager + def autocast(): + yield + + # Compute output filename + cfg = compute_output_filename(cfg, model_name) + + # if translations should not be overwritten, and already exists, skip re-translation step and return + if not cfg.return_translations and not cfg.overwrite_translations and os.path.exists(cfg.output_filename): + logging.info( + f"Previous translations found at {cfg.output_filename}, and flag `overwrite_translations`" + f"is {cfg.overwrite_translations}. Returning without re-translating text." 
+ ) + return cfg + + # translate audio + with autocast(): + with torch.no_grad(): + translations = asr_model.translate( + paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=return_hypotheses, + ) + + logging.info(f"Finished translating {len(filepaths)} files !") + logging.info(f"Writing translations into file: {cfg.output_filename}") + + if cfg.return_translations: + return translations + + # write audio translations + with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f: + for filepath, translation in zip(filepaths, translations): + item = {'audio_filepath': filepath, 'pred_translation': translation} + f.write(json.dumps(item, ensure_ascii=False) + "\n") + logging.info(f"Finished writing predictions to {cfg.output_filename}!") + + return cfg + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index a7275faea3d0..34f2c4f62e29 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -33,3 +33,4 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel +from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py new file mode 100644 index 000000000000..178746795ae8 --- /dev/null +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -0,0 +1,614 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import json +import os +import tempfile +from math import ceil +from typing import Dict, List, Optional, Union + +import editdistance +import torch +import torch.distributed as dist +from omegaconf import DictConfig, OmegaConf +from pytorch_lightning import Trainer +from tqdm.auto import tqdm + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.common.losses import SmoothedCrossEntropyLoss +from nemo.collections.common.metrics import GlobalAverageLossMetric +from nemo.collections.common.parts import transformer_weights_init + +from nemo.core.classes.common import typecheck +from nemo.core.neural_types import ( + AudioSignal, + ChannelType, + LabelsType, + LengthsType, + LogprobsType, + MaskType, + NeuralType, + SpectrogramType, +) +from nemo.utils import logging + +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + + NLP_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") + +__all__ = ['EncDecTransfModelBPE'] + + +def 
lens_to_mask(lens, max_length): + batch_size = lens.shape[0] + mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None] + return mask + + +class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin): + """Base class for encoder decoder CTC-based models.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + super().__init__(cfg=cfg, trainer=trainer) + + # Setup audio preprocessor + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor) + + # Setup audio encoder + self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder) + + # Add projection layer if encoder and decoder differ in hidden size + if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size']) + else: + self.adapter = torch.nn.Identity() + + transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) + + # Whether to add Transformer Encoder block between Conformer and Transformer Decoder + self.use_transf_encoder = False + if transf_encoder_cfg_dict['num_layers'] > 0: + self.use_transf_encoder = True + + self.transf_encoder = TransformerEncoder( + num_layers=transf_encoder_cfg_dict['num_layers'], + hidden_size=transf_encoder_cfg_dict['hidden_size'], + inner_size=transf_encoder_cfg_dict['inner_size'], + mask_future=False, + num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'], + attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'], + attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'], + ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'], + pre_ln=transf_encoder_cfg_dict.get('pre_ln', True), + 
pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True), + ) + std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5 + self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + + transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder')) + + # Transformer decoder + vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) + transf_decoder_cfg_dict['vocab_size'] = vocab_size + library = transf_decoder_cfg_dict.pop('library', 'nemo') + model_name = transf_decoder_cfg_dict.pop('model_name', None) + pretrained = transf_decoder_cfg_dict.pop('pretrained', False) + self.transf_decoder = get_transformer( + library=library, + model_name=model_name, + pretrained=pretrained, + config_dict=transf_decoder_cfg_dict, + encoder=False, + pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False), + ) + + self.log_softmax = TokenClassifier( + hidden_size=self.transf_decoder.hidden_size, + num_classes=vocab_size, + activation=self.cfg.head.activation, + log_softmax=self.cfg.head.log_softmax, + dropout=self.cfg.head.dropout, + use_transformer_init=self.cfg.head.use_transformer_init, + ) + self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight + std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 + self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range)) + + # Beam Search decoding + self.beam_search = BeamSearchSequenceGenerator( + embedding=self.transf_decoder.embedding, + decoder=self.transf_decoder.decoder, + log_softmax=self.log_softmax, + max_sequence_length=self.transf_decoder.max_sequence_length, + beam_size=self.cfg.beam_search.beam_size, + bos=self.tokenizer.bos_id, + pad=self.tokenizer.pad_id, + eos=self.tokenizer.eos_id, + len_pen=self.cfg.beam_search.len_pen, + 
max_delta_length=self.cfg.beam_search.max_generation_delta, + ) + + # Define autoregressive CE loss + self.transf_loss = SmoothedCrossEntropyLoss( + pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing + ) + + if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment) + else: + self.spec_augmentation = None + + self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + + @torch.no_grad() + def translate( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + hypotheses = self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses) + return hypotheses + + @torch.no_grad() + def transcribe( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + """ + Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. + Args: + paths2audio_files: (a list) of paths to audio files. \ + Recommended length per file is between 5 and 25 seconds. \ + But it is possible to pass a few hours long file if enough GPU memory is available. + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + logprobs: (bool) pass True to get log probabilities instead of transcripts. 
+ return_hypotheses: (bool) Either return hypotheses or text + With hypotheses can do some postprocessing like getting timestamp or rescoring + Returns: + A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if return_hypotheses and logprobs: + raise ValueError( + "Either `return_hypotheses` or `logprobs` can be True at any given time." + "Returned hypotheses will contain the logprobs." + ) + + # We will store transcriptions here + hypotheses = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + dither_value = self.preprocessor.featurizer.dither + pad_to_value = self.preprocessor.featurizer.pad_to + + try: + self.preprocessor.featurizer.dither = 0.0 + self.preprocessor.featurizer.pad_to = 0 + # Switch model to evaluation mode + self.eval() + # Freeze the encoder and decoder modules + self.encoder.freeze() + self.transf_decoder.freeze() + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + # Work in tmp directory - will store manifest file there + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp: + for audio_file in paths2audio_files: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} + fp.write(json.dumps(entry) + '\n') + + config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} + + temporary_datalayer = self._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): + log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + + beam_hypotheses = ( + self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + .detach() + .cpu() + 
.numpy() + ) + beam_hypotheses = [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses] + + if return_hypotheses: + # dump log probs per file + for idx in range(logits.shape[0]): + current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + + hypotheses += beam_hypotheses + + del test_batch, log_probs, encoded_len, enc_states, enc_mask + finally: + # set mode back to its original value + self.train(mode=mode) + self.preprocessor.featurizer.dither = dither_value + self.preprocessor.featurizer.pad_to = pad_to_value + if mode is True: + self.encoder.unfreeze() + self.transf_decoder.unfreeze() + logging.set_verbosity(logging_level) + + return hypotheses + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + shuffle = config['shuffle'] + if config.get('is_tarred', False): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + else: + collate_fn = dataset.datasets[0].collate_fn + + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def setup_training_data(self, train_data_config: Optional[DictConfig]): + + # create audio-only data loader + self._update_dataset_config(dataset_name='train', config=train_data_config) + self._train_dl = self._setup_dataloader_from_config(config=train_data_config) + + # Need to set this because if using an IterableDataset, the length of the + # dataloader is the total number of samples rather than the number of batches, + # and this messes up 
the tqdm progress bar. So we set the number of steps manually + # (to the correct number) to fix this. + if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + # We also need to check if limit_train_batches is already set. + # If it's an int, we assume that the user has set it to something sane, + # i.e. <= # training batches, and don't change it. Otherwise, adjust + # batches accordingly if it's a float (including 1.0). + if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): + self._trainer.limit_train_batches = int( + self._trainer.limit_train_batches + * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) + ) + elif self._trainer is None: + logging.warning( + "Model Trainer was not set before constructing the dataset, incorrect number of " + "training batches will be used. Please set the trainer and rebuild the dataset." + ) + + def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the validation data loader via a Dict-like object. + Args: + val_data_config: A config that contains the information regarding construction + of an ASR Training dataset. 
+ Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in val_data_config: + val_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='validation', config=val_data_config) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + + def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the test data loader via a Dict-like object. + Args: + test_data_config: A config that contains the information regarding construction + of an ASR Training dataset. + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in test_data_config: + test_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='test', config=test_data_config) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + if hasattr(self.preprocessor, '_sample_rate'): + input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) + else: + input_signal_eltype = AudioSignal() + return { + "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), + "input_signal_length": NeuralType(tuple('B'), LengthsType(), 
optional=True), + "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), + "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True), + "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + "encoder_states": NeuralType(('B', 'T', 'D'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @typecheck() + def forward( + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, + transcript=None, + transcript_length=None, + ): + """ + Forward pass of the model. + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. + processed_signal: Tensor that represents a batch of processed audio signals, + of shape (B, D, T) that has undergone processing via some DALI preprocessor. + processed_signal_length: Vector of length B, that contains the individual lengths of the + processed audio sequences. + Returns: + A tuple of 3 elements - + 1) The log probabilities tensor of shape [B, T, D]. + 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
+ 3) The greedy token predictions of the model of shape [B, T] (via argmax) + """ + has_input_signal = input_signal is not None and input_signal_length is not None + has_processed_signal = processed_signal is not None and processed_signal_length is not None + if (has_input_signal ^ has_processed_signal) == False: + raise ValueError( + f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " + " with ``processed_signal`` and ``processed_signal_len`` arguments." + ) + + if not has_processed_signal: + processed_signal, processed_signal_length = self.preprocessor( + input_signal=input_signal, length=input_signal_length + ) + + if self.spec_augmentation is not None and self.training: + processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + + enc_states = encoded.permute(0, 2, 1) + enc_states = self.adapter(enc_states) + enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype) + if self.use_transf_encoder: + enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) + + transf_log_probs = None + if transcript is not None: + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) + + return transf_log_probs, encoded_len, enc_states, enc_mask + + def compute_audio_loss(self, batch): + + if batch is None: + return 0 + + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, 
+ ) + + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + return transf_loss + + # PTL-specific methods + def training_step(self, batch, batch_nb): + + audio_loss = self.compute_audio_loss(batch) + + tensorboard_logs = { + 'train_loss': audio_loss, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + } + + return {'loss': audio_loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + processed_signal=signal, + processed_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + else: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + beam_hypotheses = self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + ground_truths = [self.tokenizer.ids_to_text(sent) for sent in transcript.detach().cpu().tolist()] + translations = [self.tokenizer.ids_to_text(sent) for sent in beam_hypotheses.detach().cpu().tolist()] + + self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1]) + + return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths} + + def test_step(self, batch, batch_idx, dataloader_idx=0): + return self.validation_step(batch, batch_idx, dataloader_idx, eval_mode="test") + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: str = "val"): + """ + Called at the end of validation to aggregate 
outputs. + :param outputs: list of individual outputs of each validation step. + """ + if not outputs: + return + + if isinstance(outputs[0], dict): + outputs = [outputs] + + for output in outputs: + eval_loss = getattr(self, 'val_loss').compute() + translations = list(itertools.chain(*[x['translations'] for x in output])) + ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output])) + + # Gather translations and ground truths from all workers + tr_and_gt = [None for _ in range(self.world_size)] + # we also need to drop pairs where ground truth is an empty string + if self.world_size > 1: + dist.all_gather_object( + tr_and_gt, [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + ) + else: + tr_and_gt[0] = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + + if self.global_rank == 0: + _translations = [] + _ground_truths = [] + for rank in range(0, self.world_size): + _translations += [t for (t, g) in tr_and_gt[rank]] + _ground_truths += [g for (t, g) in tr_and_gt[rank]] + + sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a") + sb_score = sacre_bleu.score * self.world_size + + wer_scores, wer_words = 0, 0 + for h, r in zip(_translations, _ground_truths): + wer_words += len(r.split()) + wer_scores += editdistance.eval(h.split(), r.split()) + wer_score = 1.0 * wer_scores * self.world_size / wer_words + + else: + sb_score = 0.0 + wer_score = 0.0 + + self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True) + self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True) + self.log(f"{eval_mode}_WER", wer_score, sync_dist=True) + self.val_loss.reset() + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_validation_epoch_end(outputs, dataloader_idx, eval_mode="test") + + def test_dataloader(self): + if self._test_dl is not None: + return self._test_dl + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + 
Setup function for a temporary data loader which wraps the provided audio file. + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + Returns: + A pytorch DataLoader for the given audio file(s). + """ + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + dl_config = { + 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'trim_silence': False, + 'shuffle': False, + 'num_workers': min(batch_size, os.cpu_count() - 1), + 'pin_memory': True, + } + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index 0ab0cb784273..906154213ea1 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -299,7 +299,7 @@ def create_spt_model( byte_fallback: If , fallback to a byte sequence of the character. split_digits: If true, digits are split into individual tokens. split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace. - split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) + split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. 
is Arabic diacritics which are considered part of the letter (عِدَّةُ) """ if not data_file or not os.path.exists(data_file): From ea9d3fd7acd2f96dc7109752882c0ca53addf971 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Tue, 18 Jul 2023 23:32:20 -0700 Subject: [PATCH 118/123] remove pos emb from state dict for old models (#7068) * remove pos emb from state dict Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move to nlp_model Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update comment Signed-off-by: Evelina * fix nmt test Signed-off-by: Evelina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix nmt test Signed-off-by: Evelina --------- Signed-off-by: Evelina Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/nlp/models/nlp_model.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index 032a7449c27e..d739efa88485 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -16,7 +16,7 @@ import hashlib import json import os -from typing import Any, Optional +from typing import Any, Mapping, Optional from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer @@ -385,3 +385,13 @@ def load_from_checkpoint( finally: cls._set_model_restore_state(is_being_restored=False) return checkpoint + + def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True): + # starting with transformers v4.31.0, buffer for position_ids is persistent=False + if ( + self.bert_model is not None + and "position_ids" not in self.bert_model.embeddings._modules + and 
"bert_model.embeddings.position_ids" in state_dict + ): + del state_dict["bert_model.embeddings.position_ids"] + super(NLPModel, self).load_state_dict(state_dict, strict=strict) From b1aa4c2d8ba30a819e8fad85746799d0bc9e48ad Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 19 Jul 2023 16:03:56 +0400 Subject: [PATCH 119/123] Fix typo in ASR-TTS tutorial (#7049) Signed-off-by: Vladimir Bataev --- tutorials/asr/ASR_TTS_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb index 939ef8a28d29..007713ee3cc2 100644 --- a/tutorials/asr/ASR_TTS_Tutorial.ipynb +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -685,7 +685,7 @@ "id": "2de58fbb-50be-42cd-9095-01cacfdb6931", "metadata": {}, "source": [ - "## Using Scritps (examples)" + "## Using Scripts (examples)" ] }, { From 1dde2676ad8b955cd7d75a2098e0a41e79b9fa27 Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Wed, 19 Jul 2023 07:36:37 -0700 Subject: [PATCH 120/123] Fixed tutorial's name (#7047) Signed-off-by: Vitaly Lavrukhin Co-authored-by: Vladimir Bataev --- docs/source/starthere/tutorials.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index 586ce46c0c38..3a0998197732 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -107,8 +107,8 @@ To run a tutorial: - Multi-lingual ASR - `Multi-lingual ASR `_ * - ASR - - Hybrid ASR-TTS Models Tutorial - - `Multi-lingual ASR `_ + - Hybrid ASR-TTS Models + - `Hybrid ASR-TTS Models `_ * - ASR - ASR Confidence Estimation - `ASR Confidence Estimation `_ From 6704a79d94eea55b07efa30b3c39c2998b2da5ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 19 Jul 2023 12:55:29 -0700 Subject: [PATCH 121/123] Fix documentation for Numba (#7065) (#7077) * Fix documentation for 
Numba * Update force float32 flag dynamically * Update force float32 flag dynamically * Fix nemo version --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar Co-authored-by: Eric Harper --- README.rst | 4 ++-- docs/source/nlp/api.rst | 2 +- docs/source/starthere/intro.rst | 4 ++-- nemo/collections/asr/losses/rnnt.py | 7 +++++-- nemo/core/utils/numba_utils.py | 11 ++++++----- nemo/utils/model_utils.py | 3 ++- 6 files changed, 18 insertions(+), 13 deletions(-) diff --git a/README.rst b/README.rst index 7ac95b8cef70..0d3b23a964c0 100644 --- a/README.rst +++ b/README.rst @@ -132,8 +132,8 @@ Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training t Requirements ------------ -1) Python 3.8 or above -2) Pytorch 1.10.0 or above +1) Python 3.9 or above +2) Pytorch 1.13.1 or above 3) NVIDIA GPU for training Documentation diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index 0822ade0224c..b13dedca300f 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -124,7 +124,7 @@ Datasets .. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset :show-inheritance: -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTChatDataset +.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset.GPTSFTChatDataset :show-inheritance: .. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.retro_dataset.RETRODataset diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index 2e0e272c93f4..70426d3fe4a0 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -34,9 +34,9 @@ Prerequisites Before you begin using NeMo, it's assumed you meet the following prerequisites. -#. You have Python version 3.6, 3.7 or 3.8. +#. You have Python version 3.9, 3.10. -#. You have Pytorch version 1.8.1. +#. You have Pytorch version 1.13.1 or 2.0+. #. 
You have access to an NVIDIA GPU for training. diff --git a/nemo/collections/asr/losses/rnnt.py b/nemo/collections/asr/losses/rnnt.py index a884f7d3cc68..894be6319c99 100644 --- a/nemo/collections/asr/losses/rnnt.py +++ b/nemo/collections/asr/losses/rnnt.py @@ -99,7 +99,7 @@ class RNNTLossConfig: min_version='0.53.0', is_available=NUMBA_RNNT_AVAILABLE, installation_msg=NUMBA_INSTALLATION_MESSAGE, - force_float32=not numba_utils.NUMBA_FP16_SUPPORTED, + force_float32=False, # This is only temporarily false, will be dynamically updated during resolution ), "pytorch": RNNTLossConfig( loss_name="pytorch", @@ -258,6 +258,9 @@ def resolve_rnnt_loss(loss_name: str, blank_idx: int, loss_kwargs: dict = None) _warn_unused_additional_kwargs(loss_name, loss_kwargs) elif loss_name == 'warprnnt_numba': + # Update loss config's forced float32 flag if set to None + loss_config.force_float32 = not numba_utils.is_numba_cuda_fp16_supported() + fastemit_lambda = loss_kwargs.pop('fastemit_lambda', 0.0) clamp = loss_kwargs.pop('clamp', -1.0) loss_func = RNNTLossNumba(blank=blank_idx, reduction='none', fastemit_lambda=fastemit_lambda, clamp=clamp) @@ -444,7 +447,7 @@ def forward(self, log_probs, targets, input_lengths, target_lengths): max_targets_len = target_lengths.max() # Force cast joint to float32 - if not self._force_float32 and numba_utils.NUMBA_FP16_SUPPORTED: + if not self._force_float32 and numba_utils.is_numba_cuda_fp16_supported(): # Execute the kernel in fp16 pass elif self._force_float32 and log_probs.dtype != torch.float32: diff --git a/nemo/core/utils/numba_utils.py b/nemo/core/utils/numba_utils.py index 04010a2f7db4..9117b2ea1010 100644 --- a/nemo/core/utils/numba_utils.py +++ b/nemo/core/utils/numba_utils.py @@ -29,9 +29,6 @@ __NUMBA_MINIMUM_VERSION__ = os.environ.get("NEMO_NUMBA_MINVER", __NUMBA_DEFAULT_MINIMUM_VERSION__) __NUMBA_MINIMUM_VERSION_FP16_SUPPORTED__ = "0.57.0" -NUMBA_FP16_SUPPORTED = model_utils.check_lib_version( - 'numba', 
__NUMBA_MINIMUM_VERSION_FP16_SUPPORTED__, operator=operator.ge -)[0] NUMBA_INSTALLATION_MESSAGE = ( @@ -171,12 +168,16 @@ def is_numba_cuda_fp16_supported(return_reason: bool = False) -> Union[bool, Tup use_nvidia_binding = False reason += "Env variable `NUMBA_CUDA_USE_NVIDIA_BINDING` is not available or has not set to `1`." - if NUMBA_FP16_SUPPORTED: + numba_fp16_version_correct = model_utils.check_lib_version( + 'numba', __NUMBA_MINIMUM_VERSION_FP16_SUPPORTED__, operator=operator.ge + )[0] + + if numba_fp16_version_correct: reason += f"Numba CUDA FP16 is supported in installed numba version." else: reason += f"Numba CUDA FP16 is not supported in installed numba version." - result = use_nvidia_binding and NUMBA_FP16_SUPPORTED + result = use_nvidia_binding and numba_fp16_version_correct if return_reason: return result, reason diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index 211ffdcdf11e..42a0b108944d 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import copy +import importlib import os from dataclasses import dataclass, is_dataclass from enum import Enum @@ -554,7 +555,7 @@ def check_lib_version(lib_name: str, checked_version: str, operator) -> Tuple[Op if '.' 
in lib_name: mod = import_class_by_path(lib_name) else: - mod = __import__(lib_name) + mod = importlib.import_module(lib_name) if hasattr(mod, '__version__'): lib_ver = version.Version(mod.__version__) From 39aff5ca3d52247e136a86429d567255c02a5d44 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:45:35 -0400 Subject: [PATCH 122/123] Update Frame-VAD doc and fix onnx export (#7076) * update fvad doc Signed-off-by: stevehuang52 * fix typo Signed-off-by: stevehuang52 * update fvad example Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 * fix onnx export Signed-off-by: stevehuang52 * update test Signed-off-by: stevehuang52 * refactor Signed-off-by: stevehuang52 * update doc Signed-off-by: stevehuang52 * update Signed-off-by: stevehuang52 --------- Signed-off-by: stevehuang52 Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- .../conf/vad/frame_vad_infer_postprocess.yaml | 3 +- .../speech_classification/frame_vad_infer.py | 24 ++++++++--- .../asr/models/classification_models.py | 43 +++++++++++++++++++ nemo/collections/asr/parts/utils/vad_utils.py | 6 ++- .../asr/test_asr_classification_model.py | 4 +- 5 files changed, 69 insertions(+), 11 deletions(-) diff --git a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml index d759a809ec37..30c082aff91f 100644 --- a/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml +++ b/examples/asr/conf/vad/frame_vad_infer_postprocess.yaml @@ -1,6 +1,7 @@ name: &name "vad_inference_postprocessing" -dataset: null # Path of json file of evaluation data. Audio files should have unique names +input_manifest: null # Path of json file of evaluation data. 
Audio files should have unique names +output_dir: null # Path to output directory where results will be stored num_workers: 12 sample_rate: 16000 evaluate: False # whether to get AUROC and DERs, the manifest must contains groundtruth if enabled diff --git a/examples/asr/speech_classification/frame_vad_infer.py b/examples/asr/speech_classification/frame_vad_infer.py index f716eb45bb64..594cc9637d73 100644 --- a/examples/asr/speech_classification/frame_vad_infer.py +++ b/examples/asr/speech_classification/frame_vad_infer.py @@ -21,7 +21,8 @@ ## Usage: python frame_vad_infer.py \ --config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \ - dataset= + input_manifest= \ + output_dir= The manifest json file should have the following format (each line is a Python dictionary): {"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000} @@ -58,15 +59,25 @@ @hydra_runner(config_path="../conf/vad", config_name="frame_vad_infer_postprocess") def main(cfg): - if not cfg.dataset: + if not cfg.input_manifest: raise ValueError("You must input the path of json file of evaluation data") + output_dir = cfg.output_dir if cfg.output_dir else "frame_vad_outputs" + if os.path.exists(output_dir): + logging.warning( + f"Output directory {output_dir} already exists, use this only if you're tuning post-processing params." 
+ ) + Path(output_dir).mkdir(parents=True, exist_ok=True) + + cfg.frame_out_dir = os.path.join(output_dir, "frame_preds") + cfg.smoothing_out_dir = os.path.join(output_dir, "smoothing_preds") + cfg.rttm_out_dir = os.path.join(output_dir, "rttm_preds") - # each line of dataset should be have different audio_filepath and unique name to simplify edge cases or conditions - logging.info(f"Loading manifest file {cfg.dataset}") + # each line of input_manifest should be have different audio_filepath and unique name to simplify edge cases or conditions + logging.info(f"Loading manifest file {cfg.input_manifest}") manifest_orig, key_labels_map, key_rttm_map = frame_vad_infer_load_manifest(cfg) # Prepare manifest for streaming VAD - manifest_vad_input = cfg.dataset + manifest_vad_input = cfg.input_manifest if cfg.prepare_manifest.auto_split: logging.info("Split long audio file to avoid CUDA memory issue") logging.debug("Try smaller split_duration if you still have CUDA memory issue") @@ -76,6 +87,7 @@ def main(cfg): 'split_duration': cfg.prepare_manifest.split_duration, 'num_workers': cfg.num_workers, 'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input, + 'out_dir': output_dir, } manifest_vad_input = prepare_manifest(config) else: @@ -171,7 +183,7 @@ def main(cfg): key_pred_rttm_map[key] = entry['rttm_filepath'] if not cfg.out_manifest_filepath: - out_manifest_filepath = "manifest_vad_output.json" + out_manifest_filepath = os.path.join(output_dir, "manifest_vad_output.json") else: out_manifest_filepath = cfg.out_manifest_filepath write_manifest(out_manifest_filepath, manifest_new) diff --git a/nemo/collections/asr/models/classification_models.py b/nemo/collections/asr/models/classification_models.py index 432674225f5a..264e9cef99f8 100644 --- a/nemo/collections/asr/models/classification_models.py +++ b/nemo/collections/asr/models/classification_models.py @@ -35,6 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from 
nemo.core.neural_types import * from nemo.utils import logging, model_utils +from nemo.utils.cast_utils import cast_all __all__ = ['EncDecClassificationModel', 'EncDecRegressionModel'] @@ -851,6 +852,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.eval_loop_cnt = 0 self.ratio_threshold = cfg.get('ratio_threshold', 0.2) super().__init__(cfg=cfg, trainer=trainer) + self.decoder.output_types = self.output_types + self.decoder.output_types_for_export = self.output_types @classmethod def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: @@ -1148,3 +1151,43 @@ def get_metric_logits_labels(self, logits, labels, masks): labels = labels.gather(dim=0, index=idx.view(-1)) return logits, labels + + def forward_for_export( + self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None + ): + """ + This forward is used when we need to export the model to ONNX format. + Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. + Args: + input: Tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps. + length: Vector of length B, that contains the individual lengths of the audio sequences. 
+ cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers + cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers + N is the number of such layers which need caching, B is batch size, H is the hidden size of activations, + and T is the length of the cache + + Returns: + the output of the model + """ + enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward) + if cache_last_channel is None: + encoder_output = enc_fun(audio_signal=input, length=length) + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + else: + encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun( + audio_signal=input, + length=length, + cache_last_channel=cache_last_channel, + cache_last_time=cache_last_time, + cache_last_channel_len=cache_last_channel_len, + ) + + dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward) + ret = dec_fun(hidden_states=encoder_output.transpose(1, 2)) + if isinstance(ret, tuple): + ret = ret[0] + if cache_last_channel is not None: + ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len) + return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32) diff --git a/nemo/collections/asr/parts/utils/vad_utils.py b/nemo/collections/asr/parts/utils/vad_utils.py index e4f024d231ad..d8860a0c7cff 100644 --- a/nemo/collections/asr/parts/utils/vad_utils.py +++ b/nemo/collections/asr/parts/utils/vad_utils.py @@ -275,7 +275,9 @@ def generate_overlap_vad_seq( if out_dir: overlap_out_dir = out_dir else: - overlap_out_dir = frame_pred_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap) + overlap_out_dir = os.path.join( + frame_pred_dir, "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap) + ) if not os.path.exists(overlap_out_dir): os.mkdir(overlap_out_dir) @@ -732,7 +734,7 @@ def 
generate_vad_segment_table( if not out_dir: out_dir_name = "seg_output_" for key in postprocessing_params: - out_dir_name = out_dir_name + str(key) + str(postprocessing_params[key]) + "-" + out_dir_name = out_dir_name + "-" + str(key) + str(postprocessing_params[key]) out_dir = os.path.join(vad_pred_dir, out_dir_name) diff --git a/tests/collections/asr/test_asr_classification_model.py b/tests/collections/asr/test_asr_classification_model.py index 876bb6073a38..3888cb30204c 100644 --- a/tests/collections/asr/test_asr_classification_model.py +++ b/tests/collections/asr/test_asr_classification_model.py @@ -94,8 +94,8 @@ def frame_classification_model(): } decoder = { - 'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification', - 'params': {'feat_in': 32, 'num_classes': 5,}, + 'cls': 'nemo.collections.common.parts.MultiLayerPerceptron', + 'params': {'hidden_size': 32, 'num_classes': 5,}, } modelConfig = DictConfig( From d300a3af0af517c927f5e095cdd45abba65cc1c0 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 20 Jul 2023 11:42:21 -0700 Subject: [PATCH 123/123] memmap worker arg (#7062) * memmap worker arg Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update Signed-off-by: arendu * update Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../tuning/conf/megatron_gpt_peft_tuning_config.yaml | 3 +++ .../tuning/conf/megatron_gpt_sft.yaml | 3 +++ .../language_modeling/megatron/gpt_sft_dataset.py | 11 +++++++++-- .../language_modeling/megatron_gpt_sft_model.py | 3 +++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml 
b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml index 799d105aae7c..d26dd2922088 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -116,6 +116,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 0 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -143,6 +144,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -170,6 +172,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 4 + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index 0e3f0d712dd6..f15138c99264 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -82,6 +82,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -109,6 +110,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -137,6 +139,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 
756494f2f315..da3d03199c2e 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + import numpy as np import torch @@ -40,12 +42,13 @@ def __init__( label_key: str = "answer", separate_prompt_and_response_with_newline: bool = False, answer_only_loss: bool = True, - truncation_field: str = "answer", + truncation_field: str = "context", pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. index_mapping_dir: str = None, prompt_template: str = None, virtual_tokens: int = 0, tokens_to_generate: int = 0, + memmap_workers: Optional[int] = None, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -94,7 +97,11 @@ def __init__( assert self.truncation_field in ["answer", "context"] self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir + dataset_paths=[file_path], + tokenizer=None, + header_lines=0, + index_mapping_dir=index_mapping_dir, + workers=memmap_workers, ) # Will be None after this call if `max_num_samples` is None diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index c390a8c440bf..95108e90f087 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -266,6 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True): tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files ) datasets.append(dataset)