diffusion training
zpx01 committed Oct 10, 2024
1 parent 5a173a6 commit 4e22d4e
Showing 19 changed files with 2,713 additions and 15 deletions.
14 changes: 12 additions & 2 deletions .github/workflows/_test_template.yml
@@ -49,6 +49,10 @@ jobs:
run: |
docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }}
+ - name: Start container
+ run: |
+ docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
@@ -59,7 +63,7 @@ jobs:
(
set -e
- docker run --rm --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
+ docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
) 2> >(tee err.log)
EXIT_CODE=$?
@@ -73,4 +77,10 @@ jobs:
- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: |
- docker run --rm --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
+ docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
+ - name: Container shutdown
+ if: always()
+ run: |
+ docker container stop nemo_container_${{ github.run_id }} || true
+ docker container rm nemo_container_${{ github.run_id }} || true
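
Taken together, the template now starts one long-lived container per run, executes each step's script inside it with docker exec, and always tears the container down at the end, instead of launching a fresh docker run for every step. A minimal standalone Python sketch of that lifecycle (the image name, container name, and commands are placeholders, not taken from the workflow):

# Sketch of the start -> exec -> stop/rm container lifecycle used by the updated template.
import subprocess

IMAGE = "example.azurecr.io/test_image:latest"  # placeholder image, not the CI registry path
NAME = "test_container_12345"                   # placeholder container name

def run(*args: str, check: bool = True) -> None:
    # Thin wrapper so each docker call reads like one of the workflow's run: steps.
    subprocess.run(list(args), check=check)

try:
    # Start one detached container that stays alive long enough for every step.
    run("docker", "run", "--rm", "-d", "--name", NAME, IMAGE,
        "bash", "-c", "sleep 3600")
    # Later steps execute inside that same container.
    run("docker", "exec", NAME, "bash", "-c", "echo main script")
    run("docker", "exec", NAME, "bash", "-c", "echo after script")
finally:
    # Mirror the `if: always()` shutdown step; ignore failures, as `|| true` does.
    run("docker", "container", "stop", NAME, check=False)
    run("docker", "container", "rm", NAME, check=False)

The practical difference is that the main script and after_script now share one container (and its filesystem state), which is why the explicit always-run shutdown step is needed.
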
23 changes: 10 additions & 13 deletions .github/workflows/cicd-main.yml
@@ -249,16 +249,15 @@ jobs:
--ignore=tests/utils
# L0: CPU unit tests
- OPTIONAL_L0_Unit_Tests_CPU_ASR:
+ L0_Unit_Tests_CPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
- if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
- IS_OPTIONAL: true
L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
@@ -2814,7 +2813,7 @@ jobs:
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
- exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+ exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
@@ -2838,15 +2837,15 @@ jobs:
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
- model.num_layers=8 \
+ model.num_layers=2 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
- model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
+ model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
@@ -2857,7 +2856,7 @@ jobs:
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.gradient_clip_val=1.0 \
- exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+ exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
@@ -2882,18 +2881,16 @@ jobs:
model.bias_dropout_add_fusion=False \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
- model.num_layers=8 \
+ model.num_layers=2 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
- model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
- rm -rf examples/nlp/language_modeling/gpt_pretrain_results
- rm -rf examples/nlp/language_modeling/gpt_index_mappings
+ model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping
L2_Megatron_GPT_Skip_Train:
needs: [cicd-test-container-setup]
@@ -5199,7 +5196,7 @@ jobs:
#- OPTIONAL_L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Others

- #- OPTIONAL_L0_Unit_Tests_CPU_ASR
+ - L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
13 changes: 13 additions & 0 deletions nemo/collections/diffusion/models/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
13 changes: 13 additions & 0 deletions nemo/collections/diffusion/models/dit/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
159 changes: 159 additions & 0 deletions nemo/collections/diffusion/models/dit/dit_embeddings.py
@@ -0,0 +1,159 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
from typing import Dict, Literal, Optional

import numpy as np
import torch
import torch.nn.functional as F
from diffusers.models.embeddings import TimestepEmbedding, get_3d_sincos_pos_embed
from einops import rearrange
from einops.layers.torch import Rearrange
from megatron.core import parallel_state
from megatron.core.models.common.embeddings.rotary_pos_embedding import get_pos_emb_on_this_cp_rank
from megatron.core.transformer.module import MegatronModule
from torch import nn


class ParallelTimestepEmbedding(TimestepEmbedding):
"""
ParallelTimestepEmbedding is a subclass of TimestepEmbedding that initializes
the embedding layers with an optional random seed for synchronization.
Args:
in_channels (int): Number of input channels.
time_embed_dim (int): Dimension of the time embedding.
seed (int, optional): Random seed for initializing the embedding layers.
If None, no specific seed is set.
Attributes:
linear_1 (nn.Module): First linear layer for the embedding.
linear_2 (nn.Module): Second linear layer for the embedding.
Methods:
__init__(in_channels, time_embed_dim, seed=None): Initializes the embedding layers.
"""

def __init__(self, in_channels: int, time_embed_dim: int, seed=None):
super().__init__(in_channels=in_channels, time_embed_dim=time_embed_dim)
if seed is not None:
with torch.random.fork_rng():
torch.manual_seed(seed)
self.linear_1.reset_parameters()
self.linear_2.reset_parameters()

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Computes the timestep embedding for the input tensor (cast to bfloat16).
Args:
x (torch.Tensor): Input tensor of shape (B, T, H, W, C).
Returns:
torch.Tensor: Timestep embeddings of shape (B, T, H, W, C).
"""
return super().forward(x.to(torch.bfloat16, non_blocking=True))


def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim):
"""
Adjusts the positional embeddings tensor to the current context parallel rank.
Args:
pos_emb (torch.Tensor): The positional embeddings tensor.
seq_dim (int): The sequence dimension index in the positional embeddings tensor.
Returns:
torch.Tensor: The adjusted positional embeddings tensor for the current context parallel rank.
"""
cp_size = parallel_state.get_context_parallel_world_size()
cp_rank = parallel_state.get_context_parallel_rank()
cp_idx = torch.tensor([cp_rank], device="cpu", pin_memory=True).cuda(non_blocking=True)
pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], cp_size, -1, *pos_emb.shape[(seq_dim + 1) :])
pos_emb = pos_emb.index_select(seq_dim, cp_idx)
pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :])
return pos_emb


class SinCosPosEmb3D(nn.Module):
"""
SinCosPosEmb3D is a 3D sine-cosine positional embedding module.
Args:
model_channels (int): Number of channels in the model.
h (int): Length of the height dimension.
w (int): Length of the width dimension.
t (int): Length of the temporal dimension.
spatial_interpolation_scale (float, optional): Scale factor for spatial interpolation. Default is 1.0.
temporal_interpolation_scale (float, optional): Scale factor for temporal interpolation. Default is 1.0.
Methods:
forward(x: torch.Tensor) -> torch.Tensor:
Computes the positional embeddings for the input tensor.
Args:
x (torch.Tensor): Input tensor of shape (B, T, H, W, C).
Returns:
torch.Tensor: Positional embeddings of shape (1, T, H, W, C).
"""

def __init__(
self,
*,
model_channels: int,
h: int,
w: int,
t: int,
spatial_interpolation_scale=1.0,
temporal_interpolation_scale=1.0,
):
super().__init__()
param = get_3d_sincos_pos_embed(
model_channels, [h, w], t, spatial_interpolation_scale, temporal_interpolation_scale
)
param = rearrange(param, "(b t) (h w) c -> b c t h w", h=h, w=w, b=1)
self.register_buffer("pos_embedding", torch.from_numpy(param).float(), persistent=False)

def forward(self, x: torch.Tensor) -> torch.Tensor:
B, C, T, H, W = x.shape
cp_size = parallel_state.get_context_parallel_world_size()
embeddings = self.pos_embedding[..., : T * cp_size, :H, :W]
embeddings = get_pos_emb_on_this_cp_rank(embeddings, seq_dim=2)
return embeddings


class FactorizedLearnable3DEmbedding(MegatronModule):
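"""
Learnable 3D positional embedding factorized into separate temporal, height, and
width embedding tables; forward() sums the three lookups for each (t, h, w) position id.
"""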
def __init__(
self,
config,
t: int,
h: int,
w: int,
**kwargs,
):
super().__init__(config=config)
self.emb_t = torch.nn.Embedding(t, config.hidden_size)
self.emb_h = torch.nn.Embedding(h, config.hidden_size)
self.emb_w = torch.nn.Embedding(w, config.hidden_size)

if config.perform_initialization:
config.init_method(self.emb_t.weight)
config.init_method(self.emb_h.weight)
config.init_method(self.emb_w.weight)

def forward(self, pos_ids: torch.Tensor):
return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2])
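
FactorizedLearnable3DEmbedding keeps one learnable table per axis and sums the three lookups for each (t, h, w) position id, rather than learning a single table over every position in the t x h x w grid. A dependency-free sketch of the same idea (the grid sizes, hidden size, and the ToyFactorized3DEmbedding name are illustrative, not taken from this commit):

# Minimal, Megatron-free sketch of a factorized 3D positional embedding.
import torch
from torch import nn

class ToyFactorized3DEmbedding(nn.Module):
    def __init__(self, t: int, h: int, w: int, hidden_size: int):
        super().__init__()
        # One small table per axis instead of a single t*h*w table.
        self.emb_t = nn.Embedding(t, hidden_size)
        self.emb_h = nn.Embedding(h, hidden_size)
        self.emb_w = nn.Embedding(w, hidden_size)

    def forward(self, pos_ids: torch.Tensor) -> torch.Tensor:
        # pos_ids: (..., 3) holding integer (t, h, w) indices per token.
        return self.emb_t(pos_ids[..., 0]) + self.emb_h(pos_ids[..., 1]) + self.emb_w(pos_ids[..., 2])

if __name__ == "__main__":
    T, H, W, D = 4, 8, 8, 64
    emb = ToyFactorized3DEmbedding(T, H, W, D)
    # Build (t, h, w) ids for every token in the flattened T*H*W sequence.
    grid = torch.stack(torch.meshgrid(
        torch.arange(T), torch.arange(H), torch.arange(W), indexing="ij"), dim=-1)
    pos_ids = grid.reshape(1, T * H * W, 3)  # (batch=1, seq, 3)
    print(emb(pos_ids).shape)                # torch.Size([1, 256, 64])

The version committed above additionally re-initializes each table with config.init_method when config.perform_initialization is set, so the embedding initialization matches the rest of the Megatron model.
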