
Commit

Merge branch 'main' into dpykhtar/ckpt_convert_fix
dimapihtar committed Oct 18, 2024
2 parents 9d4bd30 + 5b47a94 commit 471d5c4
Showing 68 changed files with 1,335 additions and 541 deletions.
40 changes: 22 additions & 18 deletions .github/workflows/cicd-main.yml
@@ -131,16 +131,16 @@ jobs:
### \'\'
# L0: GPU unit tests
OPTIONAL_L0_Unit_Tests_GPU_ASR:
L0_Unit_Tests_GPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
TIMEOUT: 20
# TODO: remove this hack
SCRIPT: |
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true
python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads
L0_Unit_Tests_GPU_Audio:
needs: [cicd-test-container-setup]
@@ -1212,18 +1212,6 @@ jobs:
matmul_precision=medium
AFTER_SCRIPT: |
rm -rf preds.json

# L2: Transducer alignment
OPTIONAL_L2_Transducer_alignment_Running_pytest:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
IS_OPTIONAL: true
# L2: Segmentation Tool
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
@@ -5182,6 +5170,22 @@ jobs:
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }}
L2_NeMo_2_T5_LoRA:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--peft=lora \
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \
--checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }}
L2_NeMo_2_Mixtral_Pretraining:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -5456,7 +5460,7 @@ jobs:
- gpu-test
- cicd-test-container-setup

#- OPTIONAL_L0_Unit_Tests_GPU_ASR
- L0_Unit_Tests_GPU_ASR
- L0_Unit_Tests_GPU_Audio
- L0_Unit_Tests_GPU_Common
- L0_Unit_Tests_GPU_LLM
@@ -5507,7 +5511,6 @@ jobs:
- L2_ASR_Adapters_Linear_Adapters
- L2_ASR_Adapters_RelPos_MHA_Adapters
- L2_Speech_Transcription_Speech_to_Text_Transcribe
#- OPTIONAL_L2_Transducer_alignment_Running_pytest
- L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
- L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
- L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
@@ -5589,6 +5592,7 @@ jobs:
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
- L2_NeMo_2_T5_Finetuning
- L2_NeMo_2_T5_LoRA
- L2_NeMo_2_GPT_SFT_TP1PP1_MBS1
- L2_NeMo_2_GPT_SFT_TP1PP1_MBS2
- L2_NeMo_2_GPT_SFT_TP1PP2_MBS2
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.17.0
ARG MCORE_TAG=772faca1f8d5030621b738cbd8e8bb2d8d28f6e6
ARG MCORE_TAG=0d89fc4c0d4394f915fffff11212d6957652337f

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
166 changes: 70 additions & 96 deletions docs/source/asr/asr_language_modeling_and_customization.rst

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -66,6 +66,7 @@
'taming',
'cytoolz', # for adapters
'megatron', # for nlp
"open_clip",
]

_skipped_autodoc_mock_imports = ['wrapt', 'numpy']
32 changes: 11 additions & 21 deletions docs/source/features/optimizations/activation_recomputation.rst
@@ -1,52 +1,42 @@
Activation Recomputation
========================

The input activations of network layers are stored in the device memory to compute the gradients in back-propagation.
The input activation stores easily saturate the device memory when training a LLM with a large sequence length or a large micro-batch size.
Check-pointing a few activations and recomputing the rest of activations is a common technique to reduce the need of device memory.
The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. When training an LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage.

Transformer Layer Recomputation
-------------------------------

NeMo supports Transformer layer recomputation that checkpoints the input of each Transformer layer and recomputes the activations on the rest of the layers.
Transformer layer recomputation significantly reduces the activation memory usage.
However, this approach increases per-Transformer layer computation cost by 30%, which comes from re-executing the entire layer forwarding computation.
NeMo also supports partial Transformer layer recomputation, which is beneficial when recomputing a few Transformer layers would fit the training workload on GPU memory.
This would avoid recomputing the rest of layers.
NeMo supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer’s forward computation.
NeMo also supports partial transformer layer recomputation, which is beneficial when recomputing only a few transformer layers frees enough GPU memory for the model to fit. This approach avoids the need to recompute the remaining layers.

Transformer layer recomputation is enabled by setting ``activations_checkpoint_granularity=full``.
The number of Transformer layers to recompute can be set using ``activations_checkpoint_num_layers`` along with ``activations_checkpoint_method=block``.
If one sets ``activations_checkpoint_num_layers`` as the total number of layers, the inputs of all Transformer layers are check-pointed and recomputed.
The number of transformer layers to recompute can be set using ``activations_checkpoint_num_layers`` along with ``activations_checkpoint_method=block``.
If you set ``activations_checkpoint_num_layers`` as the total number of layers, the inputs of all transformer layers are checkpointed and recomputed.
When training with pipeline parallelism, ``activations_checkpoint_num_layers`` indicates the number of layers per pipeline stage.
If the virtual pipelining is used, ``activations_checkpoint_num_layers`` means the layers per virtual pipeline stage.
When using virtual pipelining, ``activations_checkpoint_num_layers`` specifies the number of layers per virtual pipeline stage.
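A minimal sketch of the corresponding overrides, assuming these keys sit under the ``model`` section as in the Megatron-based NeMo configs:

.. code-block:: bash

   # Sketch: recompute the full forward pass of 4 transformer layers per (virtual) pipeline stage.
   # Placement of the keys under ``model`` is assumed from the Megatron-style NeMo configs.
   model.activations_checkpoint_granularity=full \
   model.activations_checkpoint_method=block \
   model.activations_checkpoint_num_layers=4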

NeMo also supports checkpointing the input to a block of multiple consecutive Transformer layers meaning that a block of Transformer layers becomes the recomputation granularity.
This can further save activation memory at the cost of increasing the recomputation buffer memory.
Thus, it is only beneficial for memory savings when the model has many Transformer layers or the intermediate layers of a Transformer layer hold relatively small activation stores.
This recomputation mode can be enabled by setting ``activations_checkpoint_method=uniform``, and the number of Transformer layers per recomputation block is set using ``activations_checkpoint_num_layers``.
NeMo also supports checkpointing the input to a block of multiple consecutive transformer layers, meaning that a block of transformer layers becomes the recomputation granularity. This approach can save activation memory but increases the recomputation buffer memory. Thus, it is only beneficial for memory savings when the model has many transformer layers or when the intermediate layers of a transformer layer hold relatively small activation stores.
This recomputation mode can be enabled by setting ``activations_checkpoint_method=uniform``, with the number of transformer layers per recomputation block set using ``activations_checkpoint_num_layers``.
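For example, a sketch of the uniform mode (same assumed ``model``-level keys), checkpointing the input of every block of two consecutive transformer layers:

.. code-block:: bash

   # Sketch: each recomputation block spans 2 consecutive transformer layers.
   model.activations_checkpoint_granularity=full \
   model.activations_checkpoint_method=uniform \
   model.activations_checkpoint_num_layers=2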

Self-attention Recomputation
----------------------------

NeMo supports the self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations.
This is a cost-efficient recomputation method; achieves high memory saving with lost recomputation cost.
The intermediate layers of the self-attention block accounts for the majority portion the activation memory.
This cost-efficient method achieves high memory savings with minimal recomputation cost.
The intermediate layers of the self-attention block account for the majority of the activation memory.
This is because the input sizes of the softmax, dropout, and qkv dot-product attention layers scale quadratically with the sequence length.
However, their recomputation cost is relatively small compared to that of the linear projection layers, which scales quadratically with the hidden size.

Self-attention recomputation is hard-enabled when using FlashAttention, which is supported in Transformer Engine.
Also, a user can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``.

Also, you can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``.
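A minimal sketch of the selective mode override (same assumed ``model``-level key):

.. code-block:: bash

   # Sketch: checkpoint only the self-attention inputs and recompute their intermediate activations.
   model.activations_checkpoint_granularity=selective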
Scheme of full and selective checkpointing granularity:

.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-2.jpg
:align: center
:alt: activation-recomputation-example-2
:scale: 50%

Scheme of uniform and block checkpointing method (full checkpointing granularity):

.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-1.jpg
:align: center
:alt: activation-recomputation-example-1
:scale: 50%
62 changes: 26 additions & 36 deletions docs/source/features/optimizations/sequence_packing.rst
@@ -1,33 +1,29 @@
Sequence Packing
================

This section explains how to use the sequence packing training technique with Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT).

Sequence Packing for SFT/PEFT
-----------------------------

Overview
^^^^^^^^
########

When finetuning a large language model with either full-parameter or parameter-efficient finetuning, GPU
underutilization is a common problem due to an inefficient data pipeline. This is because most finetuning datasets have
a skewed distribution of sequence lengths, with many short sequences and a few long sequences, following Zipf’s Law.
Transformer models can only take in fixed length inputs, so the input has to be padded with many unused pad tokens,
which is inefficient in two ways:
When fine-tuning a large language model, whether using SFT or PEFT methods, GPU underutilization often occurs due to an inefficient data pipeline. This inefficiency arises because most fine-tuning datasets have a skewed distribution of sequence lengths, with many short sequences and a few long ones, following Zipf’s Law. Since transformer models require fixed-length inputs, shorter sequences must be padded with unused tokens, leading to two main inefficiencies:

- Computation performed on the pad values is eventually ignored for model output, resulting in wasted FLOPs.
- Micro batch size is often limited by the micro batch that contains the longest sequences, so most other micro batches have underutilized GPU memory.

Sequence packing is a training technique where multiple training sequences (examples) are concatenated together into
one long sequence (pack). This eliminates the need for padding and allows more tokens to be processed in each
micro batch, maximizing both GPU compute and GPU memory.
Sequence packing is a training technique where multiple training sequences (examples) are concatenated into one long sequence (pack). This method eliminates the need for padding, allowing more tokens to be processed in each micro batch. As a result, it maximizes both GPU compute and GPU memory utilization.
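To get a rough sense of how much padding this removes, consider an illustrative (not measured) case where sequences are padded to a maximum length of 2048 but the average sequence length is only 300. The fraction of wasted token positions per micro batch is then

.. math::

   \frac{2048 - 300}{2048} \approx 0.85,

i.e. about 85% of the token positions in each padded micro batch carry pad tokens, whereas a packed batch consists almost entirely of real tokens.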

While sequences for pretraining can be concatenated naively, this is not the case for SFT and instruction fine-tuning
where each input sequence should be treated individually. The conventional solution is to build an extended attention
mask to mark the sequence id each token belongs to, and mask out attention values between sequences. However, this
increases the complexity of attention from :math:`\sum_i {s_i}^2` to :math:`\Big({\sum_i {s_i}}\Big)^2`, where :math:`s_i` is the
length of the ith subsequence. In practice, the conventional solution puts a limit on the length of packing.
Instead, NeMo provides a highly optimized version of sequence packing which makes use of variable-length attention
kernels in FlashAttention and TransformerEngine. With this, attention values between sequences are never calculated,
kernels in FlashAttention and TransformerEngine. With this approach, attention values between sequences are never calculated,
so the complexity of attention remains at :math:`\sum_i {s_i}^2`. This allows packing sequences to arbitrary lengths so
that GPU memory can be fully utilized.
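As a concrete illustration, consider packing four sequences of length 512 into a single pack of length 2048. With the conventional extended-attention-mask solution, the attention cost scales as

.. math::

   \Big(\sum_i s_i\Big)^2 = 2048^2 \approx 4.2\,\text{M},

whereas with the variable-length kernels it stays at

.. math::

   \sum_i {s_i}^2 = 4 \times 512^2 \approx 1.0\,\text{M},

a 4x reduction that grows with the number of sequences per pack.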

@@ -40,31 +36,30 @@ All things considered, NeMo’s implementation of sequence packing provides [#f1]


How to run SFT/PEFT with packed sequence
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
########################################

Prepare Dataset
"""""""""""""""
^^^^^^^^^^^^^^^

We provide a convenient script to pack your SFT or PEFT dataset.
This script assumes that you already have a prepared dataset file for SFT/PEFT training in NeMo. If you do not, please
follow `this <https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/llama2sft.html#prepare-data>`_ to
download and prepare the dolly dataset as an example.
download and prepare the Dolly dataset as an example.
You will get a file named training.jsonl. The rest of this tutorial also assumes you already have a recipe for
training with the unpacked dataset.

Two main steps are run in this script:

1. The online processing code in GPTSFTDataset is run (including prompt template manipulation, sequence length
truncation, tokenization, etc) and the result is an array of tokenized sequences, represented by indices).
2. The sequences are grouped by length, and a packing algorithm is run.
1. The online processing code in GPTSFTDataset is run. This includes tasks such as prompt template manipulation, sequence length truncation, and tokenization. The result is an array of tokenized sequences, represented by indices.
2. The tokenized sequences are grouped by length and a packing algorithm is run.

You can read more about packing algorithms `here <https://en.wikipedia.org/wiki/Bin_packing_problem#Offline_algorithms>`_.
Currently, two variants of *first fit* are supported.
- *first_fit_decreasing* sorts the sequences in decreasing order before applying the first-fit algorithm. It generates a
Currently, two variants of ``first_fit`` are supported.
- ``first_fit_decreasing`` sorts the sequences in decreasing order before applying the first-fit algorithm. It generates a
more optimal packing, but it tends to keep all short sequences together, which may have an impact on convergence.
- *first_fit_shuffle* runs first-fit in a random order. Packing is less optimal but it keeps the dataset order random.
The recommendation is to run *first_fit_shuffle* and check the packed sequence lengths. If they are similar to the
target length (i.e. efficient packing), then use shuffle. Otherwise try *first_fit_decreasing*.
- ``first_fit_shuffle`` runs first-fit in a random order. Packing is less optimal but it keeps the dataset order random.
The recommendation is to run ``first_fit_shuffle`` and check the packed sequence lengths. If they are similar to the
target length (i.e. efficient packing), then use shuffle. Otherwise try ``first_fit_decreasing``.

.. code-block:: bash
@@ -79,32 +74,28 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f
.. note::

Note 1. If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will
need to pass in the same configs to ``model.data.train_ds`` as you would for training with unpacked dataset.
1. If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will need to pass in the same configs to ``model.data.train_ds`` as you would for training with an unpacked dataset.

Note 2. ``model.data.train_ds.max_seq_length`` is the length to truncate each sequence before packing multiple sequences
to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data,
and can be determined by examining the distribution of sequence lengths in the dataset.
2. ``model.data.train_ds.max_seq_length`` is the length to which each sequence is truncated before packing multiple sequences to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data and can be determined by examining the distribution of sequence lengths in the dataset.

Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for
each pack size. The output files are named ``<output_folder>/packed_{pack_size}_seed{seed}.npy``.
3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``<output_folder>/packed_{pack_size}_seed{seed}.npy``.
This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length
can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in
the unpacked case.


Adjust Training Config
""""""""""""""""""""""
^^^^^^^^^^^^^^^^^^^^^^

To train with packed sequences, you need to change four items in the SFT/PEFT config file
To train with packed sequences, you need to change four items in the SFT/PEFT config file.

1. Turn on the packed_sequence flag
1. Turn on the packed_sequence flag:

.. code-block:: bash
++model.data.train_ds.packed_sequence=True
2. Use the new dataset file instead of the original jsonl file
2. Use the new dataset file instead of the original jsonl file:

.. code-block:: bash
@@ -130,15 +121,14 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co
model.micro_batch_size=1
model.global_batch_size=<GBS divided by n>
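As a worked example with illustrative numbers (not taken from any particular recipe): if the unpacked recipe used a global batch size of 128 and each pack holds on average n = 4 sequences, the packed settings become

.. code-block:: bash

   # Illustrative values only: unpacked GBS was 128, average of 4 sequences per pack.
   model.micro_batch_size=1 \
   model.global_batch_size=32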
Now you are all set to finetune your model with a much improved throughput!
Now, you are all set to fine-tune your model with a much improved throughput!

Sequence Packing for NeVA
-------------------------

Sequence packing in NeVA (Multimodal LLMs) differs slightly from the LLM SFT/PEFT approach. For details,
please refer to the documentation below
Sequence packing with NeVA for multimodal large language models differs from the LLM SFT/PEFT approach. For details, please refer to the documentation below.

:doc:`../multimodal/mllm/sequence_packing`
:doc:`../../multimodal/mllm/sequence_packing`

.. rubric:: Footnotes
