Merge branch 'main' into opt_lora_dropout
guyueh1 authored Aug 28, 2024
2 parents 15e3844 + e68f981 commit 0b56ac9
Showing 20 changed files with 1,057 additions and 68 deletions.
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/sd.rst
@@ -163,7 +163,7 @@ Optimization related configurations
Training with precached latents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
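
As a hedged illustration of the pre-cached latents setup described in the paragraph above (this sketch is not part of the original doc): only the first_stage_key and cond_stage_key values come from the documentation; the surrounding config structure is a placeholder.

    from omegaconf import OmegaConf

    # Minimal sketch: point an SD training config at pre-cached latents.
    # The initial structure here is illustrative; real configs carry many more fields.
    cfg = OmegaConf.create({"model": {"first_stage_key": "images", "cond_stage_key": "captions"}})
    cfg.model.first_stage_key = "image_encoded"    # image latents pre-computed by the frozen VAE
    cfg.model.cond_stage_key = "captions_encoded"  # caption embeddings pre-computed by the frozen text encoder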

Reference
-----------
[New changed file: path not captured in this excerpt]
@@ -46,8 +46,8 @@

import os
from argparse import ArgumentParser
from functools import cache

import inflect
import regex as re
from tqdm import tqdm

@@ -60,12 +60,21 @@
)
from nemo.utils import logging

engine = inflect.engine()

@cache
def inflect_engine():
import inflect

return inflect.engine()


# these are all words that can appear in a verbalized number, this list will be used later as a filter to detect numbers in verbalizations
number_verbalizations = list(range(0, 20)) + list(range(20, 100, 10))
number_verbalizations = (
[engine.number_to_words(x, zero="zero").replace("-", " ").replace(",", "") for x in number_verbalizations]
[
inflect_engine().number_to_words(x, zero="zero").replace("-", " ").replace(",", "")
for x in number_verbalizations
]
+ ["hundred", "thousand", "million", "billion", "trillion"]
+ ["point"]
)
@@ -85,7 +94,7 @@ def process_url(o):
"""

def flatten(l):
""" flatten a list of lists """
"""flatten a list of lists"""
return [item for sublist in l for item in sublist]

if o != '<self>' and '_letter' in o:
@@ -129,6 +138,7 @@ def convert2digits(digits: str):
Return:
res: number verbalization of the integer prefix of the input
"""
engine = inflect_engine()
res = []
for i, x in enumerate(digits):
if x in digit:
@@ -145,6 +155,7 @@ def convert2digits(digits: str):


def convert(example):
engine = inflect_engine()
cls, written, spoken = example

written = convert_fraction(written)
@@ -288,7 +299,7 @@ def convert(example):
def ignore(example):
"""
This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged.
Args:
example: data example
"""
@@ -300,7 +311,7 @@ def ignore(example):


def process_file(fp):
""" Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory.
"""Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory.
For more info about the data format, refer to the
`text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.
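
The hunks above (and the matching change in cleaners.py below) replace a module-level engine = inflect.engine() with a cached lazy initializer, so importing the module no longer pays for importing inflect or constructing the engine. A minimal standalone sketch of the pattern, runnable on its own if inflect is installed:

    from functools import cache


    @cache
    def inflect_engine():
        # Imported inside the function so the enclosing module stays cheap to import;
        # @cache guarantees the engine is built once and reused on every later call.
        import inflect

        return inflect.engine()


    # First call builds the engine; subsequent calls return the cached instance.
    print(inflect_engine().number_to_words(42))  # forty-two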
16 changes: 12 additions & 4 deletions nemo/collections/common/parts/preprocessing/cleaners.py
@@ -14,7 +14,6 @@

import re

import inflect
from text_unidecode import unidecode

from nemo.utils import logging
@@ -139,7 +138,14 @@
]


inflect = inflect.engine()
from functools import cache


@cache
def inflect_engine():
import inflect

return inflect.engine()


def clean_text(string, table, punctuation_to_replace, abbreviation_version=None):
@@ -194,11 +200,12 @@ def reset(self):
self.currency = None

def format_final_number(self, whole_num, decimal):
inflect = inflect_engine()
if self.currency:
return_string = inflect.number_to_words(whole_num)
return_string += " dollar" if whole_num == 1 else " dollars"
if decimal:
return_string += " and " + inflect.number_to_words(decimal)
return_string += " and " + inflect_engine().number_to_words(decimal)
return_string += " cent" if whole_num == decimal else " cents"
self.reset()
return return_string
@@ -210,11 +217,12 @@ def format_final_number(self, whole_num, decimal):
else:
# Check if there are non-numbers
def convert_to_word(match):
return " " + inflect.number_to_words(match.group(0)) + " "
return " " + inflect_engine().number_to_words(match.group(0)) + " "

return re.sub(r'[0-9,]+', convert_to_word, whole_num)

def clean(self, match):
inflect = inflect_engine()
ws = match.group(2)
number = match.group(3)
_proceeding_symbol = match.group(7)
7 changes: 4 additions & 3 deletions nemo/collections/common/tokenizers/en_ja_tokenizers.py
@@ -14,9 +14,6 @@
import re
from typing import List

from pangu import spacing
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

try:
import ipadic
import MeCab
@@ -36,6 +33,8 @@ class EnJaProcessor:
"""

def __init__(self, lang_id: str):
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.lang_id = lang_id
self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
@@ -81,6 +80,8 @@ def __init__(self):
self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")

def detokenize(self, text: List[str]) -> str:
from pangu import spacing

RE_WS_IN_FW = re.compile(
r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
)
4 changes: 2 additions & 2 deletions nemo/collections/common/tokenizers/indic_tokenizers.py
@@ -14,8 +14,6 @@

from typing import List

from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer


class IndicProcessor:
"""
@@ -26,6 +24,8 @@ class IndicProcessor:
def __init__(self, lang_id: str):
if lang_id != 'hi':
raise NotImplementedError
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
self.normalizer = MosesPunctNormalizer(lang=lang_id)
4 changes: 2 additions & 2 deletions nemo/collections/common/tokenizers/moses_tokenizers.py
@@ -14,15 +14,15 @@

from typing import List

from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer


class MosesProcessor:
"""
Tokenizer, Detokenizer and Normalizer utilities in Moses
"""

def __init__(self, lang_id: str):
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
self.normalizer = MosesPunctNormalizer(lang=lang_id)
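
The three tokenizer files above apply the same idea: the sacremoses (and pangu) imports move from module level into __init__ and method bodies, so they are only imported when a processor is actually constructed. Call sites are unchanged; a hedged usage sketch, assuming sacremoses is installed:

    from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor

    # sacremoses is imported here, at construction time, rather than when the module loads.
    processor = MosesProcessor(lang_id="en")
    # The attributes set up in __init__ (see the diff above) expose the Moses utilities.
    tokens = processor.moses_tokenizer.tokenize("Hello, world!")
    print(tokens)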
18 changes: 18 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -39,6 +39,9 @@
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
Llama31Config8B,
Llama31Config70B,
Llama31Config405B,
LlamaConfig,
LlamaModel,
MaskedTokenLossReduction,
@@ -55,6 +58,12 @@
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
Qwen2Config,
Qwen2Config1P5B,
Qwen2Config7B,
Qwen2Config72B,
Qwen2Config500M,
Qwen2Model,
gpt_data_step,
gpt_forward_step,
)
@@ -93,6 +102,9 @@
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"Llama31Config8B",
"Llama31Config70B",
"Llama31Config405B",
"CodeLlamaConfig7B",
"CodeLlamaConfig13B",
"CodeLlamaConfig34B",
@@ -111,6 +123,12 @@
"ChatGLM2Config6B",
"ChatGLM3Config6B",
"ChatGLMModel",
"Qwen2Model",
"Qwen2Config7B",
"Qwen2Config",
"Qwen2Config500M",
"Qwen2Config1P5B",
"Qwen2Config72B",
"PreTrainingDataModule",
"FineTuningDataModule",
"SquadDataModule",
20 changes: 20 additions & 0 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -27,6 +27,9 @@
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
Llama31Config8B,
Llama31Config70B,
Llama31Config405B,
LlamaConfig,
LlamaModel,
)
@@ -46,6 +49,14 @@
NemotronConfig,
NemotronModel,
)
from nemo.collections.llm.gpt.model.qwen2 import (
Qwen2Config,
Qwen2Config1P5B,
Qwen2Config7B,
Qwen2Config72B,
Qwen2Config500M,
Qwen2Model,
)

__all__ = [
"GPTConfig",
@@ -62,6 +73,9 @@
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"Llama31Config8B",
"Llama31Config70B",
"Llama31Config405B",
"NemotronConfig",
"Nemotron3Config4B",
"Nemotron3Config8B",
@@ -87,6 +101,12 @@
"ChatGLM2Config6B",
"ChatGLM3Config6B",
"ChatGLMModel",
"Qwen2Config",
"Qwen2Config500M",
"Qwen2Config1P5B",
"Qwen2Config7B",
"Qwen2Config72B",
"Qwen2Model",
"MaskedTokenLossReduction",
"gpt_data_step",
"gpt_forward_step",
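
The two __init__ diffs above only extend the import and __all__ lists so the new Llama 3.1 and Qwen2 configs are reachable from the collection namespace. A hedged usage sketch; the constructor arguments expected by Qwen2Model beyond the config are an assumption here and depend on the trainer/tokenizer setup:

    from nemo.collections.llm import Qwen2Config7B, Qwen2Model

    config = Qwen2Config7B()    # preset hyperparameters for the 7B variant
    model = Qwen2Model(config)  # assumed: the config is the first positional argument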
12 changes: 11 additions & 1 deletion nemo/collections/llm/gpt/model/base.py
@@ -13,6 +13,7 @@
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
from nemo.utils import logging

HAVE_TE = True
try:
@@ -131,10 +132,19 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
if not isinstance(transformer_layer_spec, ModuleSpec):
transformer_layer_spec = transformer_layer_spec(self)

if hasattr(self, 'vocab_size'):
vocab_size = self.vocab_size
logging.info(
f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
f" {vocab_size - tokenizer.vocab_size}."
)
else:
vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)

return MCoreGPTModel(
self,
transformer_layer_spec=transformer_layer_spec,
vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by),
vocab_size=vocab_size,
max_sequence_length=self.seq_length,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
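
The configure_model hunk above lets a GPT config pin vocab_size explicitly (logging how many dummy tokens that implies) instead of always deriving a padded size from the tokenizer. A standalone sketch of the same decision, where pad_to stands in for make_vocab_size_divisible_by and the rounding only approximates what get_vocab_size does:

    from typing import Optional


    def resolve_vocab_size(preset: Optional[int], tokenizer_vocab: int, pad_to: int = 128) -> int:
        if preset is not None:
            # A preset always wins; anything beyond tokenizer_vocab behaves as dummy/padding tokens.
            return preset
        # Otherwise round the tokenizer vocabulary up to a multiple of pad_to,
        # which keeps the embedding/output shapes friendly to tensor parallelism.
        return ((tokenizer_vocab + pad_to - 1) // pad_to) * pad_to


    assert resolve_vocab_size(None, 32000) == 32000     # already a multiple of 128
    assert resolve_vocab_size(None, 32100) == 32128     # padded up to the next multiple
    assert resolve_vocab_size(151936, 32000) == 151936  # preset overrides the derived size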
(The remaining changed files in this commit were not rendered on this page.)
