diffusion training
zpx01 committed Oct 10, 2024
1 parent 5a173a6 commit 4e22d4e
Showing 19 changed files with 2,713 additions and 15 deletions.
14 changes: 12 additions & 2 deletions .github/workflows/_test_template.yml
@@ -49,6 +49,10 @@ jobs:
run: |
docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }}
+ - name: Start container
+ run: |
+ docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
@@ -59,7 +63,7 @@ jobs:
(
set -e
- docker run --rm --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
+ docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
) 2> >(tee err.log)
EXIT_CODE=$?
@@ -73,4 +77,10 @@ jobs:
- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: |
- docker run --rm --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
+ docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
+ - name: Container shutdown
+ if: always()
+ run: |
+ docker container stop nemo_container_${{ github.run_id }} || true
+ docker container rm nemo_container_${{ github.run_id }} || true
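
Taken together, the template now starts one long-lived container per run, executes each step's script inside it with docker exec, and always tears the container down at the end, instead of launching a fresh docker run for every step. A minimal standalone Python sketch of that lifecycle (the image name, container name, and commands are placeholders, not taken from the workflow):

# Sketch of the start -> exec -> stop/rm container lifecycle used by the updated template.
import subprocess

IMAGE = "example.azurecr.io/test_image:latest"  # placeholder image, not the CI registry path
NAME = "test_container_12345"                   # placeholder container name

def run(*args: str, check: bool = True) -> None:
    # Thin wrapper so each docker call reads like one of the workflow's run: steps.
    subprocess.run(list(args), check=check)

try:
    # Start one detached container that stays alive long enough for every step.
    run("docker", "run", "--rm", "-d", "--name", NAME, IMAGE,
        "bash", "-c", "sleep 3600")
    # Later steps execute inside that same container.
    run("docker", "exec", NAME, "bash", "-c", "echo main script")
    run("docker", "exec", NAME, "bash", "-c", "echo after script")
finally:
    # Mirror the `if: always()` shutdown step; ignore failures, as `|| true` does.
    run("docker", "container", "stop", NAME, check=False)
    run("docker", "container", "rm", NAME, check=False)

The practical difference is that the main script and after_script now share one container (and its filesystem state), which is why the explicit always-run shutdown step is needed.
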
23 changes: 10 additions & 13 deletions .github/workflows/cicd-main.yml
@@ -249,16 +249,15 @@ jobs:
--ignore=tests/utils
# L0: CPU unit tests
- OPTIONAL_L0_Unit_Tests_CPU_ASR:
+ L0_Unit_Tests_CPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
- if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
- IS_OPTIONAL: true
L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
@@ -2814,7 +2813,7 @@ jobs:
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
- exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+ exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
@@ -2838,15 +2837,15 @@ jobs:
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
- model.num_layers=8 \
+ model.num_layers=2 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
- model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
+ model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
@@ -2857,7 +2856,7 @@ jobs:
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.gradient_clip_val=1.0 \
- exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+ exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
@@ -2882,18 +2881,16 @@ jobs:
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
- model.num_layers=8 \
+ model.num_layers=2 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
- model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
- rm -rf examples/nlp/language_modeling/gpt_pretrain_results
- rm -rf examples/nlp/language_modeling/gpt_index_mappings
+ model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping
L2_Megatron_GPT_Skip_Train:
needs: [cicd-test-container-setup]
@@ -5199,7 +5196,7 @@ jobs:
#- OPTIONAL_L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Others

- #- OPTIONAL_L0_Unit_Tests_CPU_ASR
+ - L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
13 changes: 13 additions & 0 deletions nemo/collections/diffusion/models/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions nemo/collections/diffusion/models/dit/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
159 changes: 159 additions & 0 deletions nemo/collections/diffusion/models/dit/dit_embeddings.py
@@ -0,0 +1,159 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
from typing import Dict, Literal, Optional

import numpy as np
import torch
import torch.nn.functional as F
from diffusers.models.embeddings import TimestepEmbedding, get_3d_sincos_pos_embed
from einops import rearrange
from einops.layers.torch import Rearrange
from megatron.core import parallel_state
from megatron.core.models.common.embeddings.rotary_pos_embedding import get_pos_emb_on_this_cp_rank
from megatron.core.transformer.module import MegatronModule
from torch import nn


class ParallelTimestepEmbedding(TimestepEmbedding):
"""
ParallelTimestepEmbedding is a subclass of TimestepEmbedding that initializes
the embedding layers with an optional random seed for synchronization.
Args:
in_channels (int): Number of input channels.
time_embed_dim (int): Dimension of the time embedding.
seed (int, optional): Random seed for initializing the embedding layers.
If None, no specific seed is set.
Attributes:
linear_1 (nn.Module): First linear layer for the embedding.
linear_2 (nn.Module): Second linear layer for the embedding.
Methods:
__init__(in_channels, time_embed_dim, seed=None): Initializes the embedding layers.
"""

def __init__(self, in_channels: int, time_embed_dim: int, seed=None):
super().__init__(in_channels=in_channels, time_embed_dim=time_embed_dim)
if seed is not None:
with torch.random.fork_rng():
torch.manual_seed(seed)
self.linear_1.reset_parameters()
self.linear_2.reset_parameters()

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Computes the timestep embedding for the input tensor (cast to bfloat16).
Args:
x (torch.Tensor): Input tensor of shape (B, T, H, W, C).
Returns:
torch.Tensor: Timestep embeddings of shape (B, T, H, W, C).
"""
return super().forward(x.to(torch.bfloat16, non_blocking=True))


def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim):
"""
Adjusts the positional embeddings tensor to the current context parallel rank.
Args:
pos_emb (torch.Tensor): The positional embeddings tensor.
seq_dim (int): The sequence dimension index in the positional embeddings tensor.
Returns:
torch.Tensor: The adjusted positional embeddings tensor for the current context parallel rank.
"""
cp_size = parallel_state.get_context_parallel_world_size()
cp_rank = parallel_state.get_context_parallel_rank()
cp_idx = torch.tensor([cp_rank], device="cpu", pin_memory=True).cuda(non_blocking=True)
pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], cp_size, -1, *pos_emb.shape[(seq_dim + 1) :])
pos_emb = pos_emb.index_select(seq_dim, cp_idx)
pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :])
return pos_emb


class SinCosPosEmb3D(nn.Module):
"""
SinCosPosEmb3D is a 3D sine-cosine positional embedding module.
Args:
model_channels (int): Number of channels in the model.
h (int): Length of the height dimension.
w (int): Length of the width dimension.
t (int): Length of the temporal dimension.
spatial_interpolation_scale (float, optional): Scale factor for spatial interpolation. Default is 1.0.
temporal_interpolation_scale (float, optional): Scale factor for temporal interpolation. Default is 1.0.
Methods:
forward(x: torch.Tensor) -> torch.Tensor:
Computes the positional embeddings for the input tensor.
Args:
x (torch.Tensor): Input tensor of shape (B, T, H, W, C).
Returns:
torch.Tensor: Positional embeddings of shape (1, T, H, W, C).
"""

def __init__(
self,
*,
model_channels: int,
h: int,
w: int,
t: int,
spatial_interpolation_scale=1.0,
temporal_interpolation_scale=1.0,
):
super().__init__()
param = get_3d_sincos_pos_embed(
model_channels, [h, w], t, spatial_interpolation_scale, temporal_interpolation_scale
)
param = rearrange(param, "(b t) (h w) c -> b c t h w", h=h, w=w, b=1)
self.register_buffer("pos_embedding", torch.from_numpy(param).float(), persistent=False)

def forward(self, x: torch.Tensor) -> torch.Tensor:
B, C, T, H, W = x.shape
cp_size = parallel_state.get_context_parallel_world_size()
embeddings = self.pos_embedding[..., : T * cp_size, :H, :W]
embeddings = get_pos_emb_on_this_cp_rank(embeddings, seq_dim=2)
return embeddings


class FactorizedLearnable3DEmbedding(MegatronModule):
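"""
Learnable 3D positional embedding factorized into separate temporal, height, and
width embedding tables; forward() sums the three lookups for each (t, h, w) position id.
"""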
def __init__(
self,
config,
t: int,
h: int,
w: int,
**kwargs,
):
super().__init__(config=config)
self.emb_t = torch.nn.Embedding(t, config.hidden_size)
self.emb_h = torch.nn.Embedding(h, config.hidden_size)
self.emb_w = torch.nn.Embedding(w, config.hidden_size)

if config.perform_initialization:
config.init_method(self.emb_t.weight)
config.init_method(self.emb_h.weight)
config.init_method(self.emb_w.weight)

def forward(self, pos_ids: torch.Tensor):
return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2])
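
FactorizedLearnable3DEmbedding keeps one learnable table per axis and sums the three lookups for each (t, h, w) position id, rather than learning a single table over every position in the t x h x w grid. A dependency-free sketch of the same idea (the grid sizes, hidden size, and the ToyFactorized3DEmbedding name are illustrative, not taken from this commit):

# Minimal, Megatron-free sketch of a factorized 3D positional embedding.
import torch
from torch import nn

class ToyFactorized3DEmbedding(nn.Module):
    def __init__(self, t: int, h: int, w: int, hidden_size: int):
        super().__init__()
        # One small table per axis instead of a single t*h*w table.
        self.emb_t = nn.Embedding(t, hidden_size)
        self.emb_h = nn.Embedding(h, hidden_size)
        self.emb_w = nn.Embedding(w, hidden_size)

    def forward(self, pos_ids: torch.Tensor) -> torch.Tensor:
        # pos_ids: (..., 3) holding integer (t, h, w) indices per token.
        return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2])

if __name__ == "__main__":
    T, H, W, D = 4, 8, 8, 64
    emb = ToyFactorized3DEmbedding(T, H, W, D)
    # Build (t, h, w) ids for every token in the flattened T*H*W sequence.
    grid = torch.stack(torch.meshgrid(
        torch.arange(T), torch.arange(H), torch.arange(W), indexing="ij"), dim=-1)
    pos_ids = grid.reshape(1, T * H * W, 3)  # (batch=1, seq, 3)
    print(emb(pos_ids).shape)                # torch.Size([1, 256, 64])

The version committed above additionally re-initializes each table with config.init_method when config.perform_initialization is set, so the embedding initialization matches the rest of the Megatron model.
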