
Commit

Merge branch 'main' into dev-yenshiw-te-fp8-inference
yen-shi authored May 23, 2023
2 parents 80d43bb + efec347 commit beb15c3
Showing 128 changed files with 4,952 additions and 621 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -43,5 +43,4 @@ repos:
hooks:
- id: black
name: Format code
args: [--skip-string-normalization, --line-length=119]
additional_dependencies: ['click==8.0.2']
9 changes: 2 additions & 7 deletions Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
@@ -43,11 +43,6 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

WORKDIR /workspace/
# Install Megatron-core
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \
pip install -e .

WORKDIR /tmp/
# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch
@@ -94,7 +89,7 @@ COPY . .

# start building the final container
FROM nemo-deps as nemo
ARG NEMO_VERSION=1.18.0
ARG NEMO_VERSION=1.19.0

# Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
# version information as runtime environment variable for introspection purposes
58 changes: 24 additions & 34 deletions Jenkinsfile
@@ -1,8 +1,8 @@
pipeline {
agent {
docker {
image 'pytorch_23.03:apex_57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g'
image 'nvcr.io/nvidia/pytorch:23.04-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1'
}
}
options {
@@ -57,16 +57,6 @@ pipeline {
}
}

// TODO: remove when pip package is available
stage('Megatron Core installation') {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \
pip install -e .'
}
}

stage('PyTorch Lightning version') {
steps {
sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"'
@@ -1014,7 +1004,7 @@ pipeline {
// TODO: pleasefixme @redoctopus
// stage('ByT5G2P training, evaluation and inference') {
// steps {
// sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \
// sh 'TRANSFORMERS_OFFLINE=1 && cd examples/tts/g2p && \
// TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
// python g2p_train_and_evaluate.py \
// train_manifest=/home/TestData/g2p/g2p.json \
@@ -1158,7 +1148,7 @@ pipeline {
parallel {
stage('Dialogue: Intent and slot classification using GPT') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\
@@ -1185,7 +1175,7 @@
}
stage('Intent and slot classification using SGDQA') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \
@@ -1208,7 +1198,7 @@
}
stage('Intent and slot classification using IntentSlotClassificationModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
model.dataset.data_dir=/home/TestData/nlp/processed_assistant \
model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \
@@ -1230,7 +1220,7 @@
}
stage('Intent classification using ZeroShotIntentModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \
@@ -1255,7 +1245,7 @@
}
stage('Design Intent classification using ZeroShotIntentModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
@@ -1281,7 +1271,7 @@
}
stage('Design Intent classification using ZeroShotIntentModel BART Classifier') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
@@ -1300,7 +1290,7 @@
}
stage('Design Intent classification using DialogueNearestNeighbourModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/design_dataset \
@@ -1329,7 +1319,7 @@ pipeline {
parallel {
stage('Dialogue: Answer Extender using DialogueS2SGenerationModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
@@ -1354,7 +1344,7 @@
}
stage('Dialogue: SGD Based Answer Extender using DialogueS2SGenerationModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
@@ -1395,7 +1385,7 @@ pipeline {
// parallel {
// stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') {
// steps {
// sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
// sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
// python dialogue.py \
// do_training=False \
// model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
@@ -1425,7 +1415,7 @@ pipeline {
parallel {
stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \
python dialogue.py \
do_training=False \
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \
@@ -1549,7 +1539,7 @@ pipeline {
stage('BERT SQUAD 1.1') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
@@ -1574,7 +1564,7 @@
stage('BERT SQUAD 2.0') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
@@ -1608,7 +1598,7 @@ pipeline {
stage('BART SQUAD 1.1') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
@@ -1634,7 +1624,7 @@
stage('BART SQUAD 2.0') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
@@ -1669,7 +1659,7 @@ pipeline {
stage('GPT2 SQUAD 1.1') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
@@ -1695,7 +1685,7 @@
stage('GPT2 SQUAD 2.0') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
@@ -3809,8 +3799,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') {
when {
anyOf {
branch 'r1.18.0'
changeRequest target: 'r1.18.0'
branch 'main'
changeRequest target: 'main'
}
}
failFast true
@@ -4016,7 +4006,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/t5_index_mappings"
}
}

stage('L2: Megatron T5 Prompt Learning TP1 PP1') {
when {
anyOf {
@@ -4101,7 +4091,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
}
}
}

// TODO: add when https://github.com/NVIDIA/apex/pull/1596 is merged
// stage('L2: Megatron T5 Prompt Learning TP1 PP2') {
// when {
21 changes: 6 additions & 15 deletions README.rst
@@ -236,8 +236,8 @@ Note that RNNT requires numba to be installed from conda.
NeMo Megatron
~~~~~~~~~~~~~
NeMo Megatron training requires NVIDIA Apex and Megatron-core to be installed.
Install them manually if not using the NVIDIA PyTorch container.
NeMo Megatron training requires NVIDIA Apex to be installed.
Install it manually if not using the NVIDIA PyTorch container.

To install Apex, run

@@ -248,15 +248,6 @@ To install Apex, run
git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
To install Megatron-core, run

.. code-block:: bash
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03
pip install -e .
It is highly recommended to use the NVIDIA PyTorch or NeMo container if you run into issues installing Apex or any other dependencies.

While installing Apex, it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with.
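For example, a quick way to check which CUDA version the installed torch build was compiled against (and compare it with the toolkit reported by ``nvcc --version``) is:

.. code-block:: python

    import torch

    # CUDA version the installed torch wheel was built against, e.g. "12.1"
    print(torch.version.cuda)

    # True only if a GPU and a compatible driver are visible at runtime
    print(torch.cuda.is_available())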
@@ -295,13 +286,13 @@ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separa

Docker containers:
~~~~~~~~~~~~~~~~~~
We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.16.0`` comes with container ``nemo:23.01``; you can find more details about released containers on the `releases page <https://github.com/NVIDIA/NeMo/releases>`_.
We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.18.1`` comes with container ``nemo:23.03``; you can find more details about released containers on the `releases page <https://github.com/NVIDIA/NeMo/releases>`_.

To use the built container, please run

.. code-block:: bash
docker pull nvcr.io/nvidia/nemo:23.01
docker pull nvcr.io/nvidia/nemo:23.03
To build a nemo container with Dockerfile from a branch, please run

@@ -310,13 +301,13 @@
DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .
If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.03-py3 and then installing from GitHub.
If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub.

.. code-block:: bash
docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
-p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.03-py3
stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3
Examples
--------
11 changes: 6 additions & 5 deletions docs/source/asr/configs.rst
@@ -885,17 +885,17 @@ Hybrid ASR-TTS Model Configuration

:ref:`Hybrid ASR-TTS model <Hybrid-ASR-TTS_model>` consists of three parts:

* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``)
* TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch <FastPitch_model>` model is supported)
* Enhancer model (optional)
* :ref:`Enhancer model <SpectrogramEnhancer_model>` (optional)

Also, the config allows specifying a :ref:`text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>`.

Main parts of the config:

* ASR model
* ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once, then the config of the ASR model is stored in the ``asr_model`` field
* ``asr_model_type``: needed only when training from scratch, ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``
* ``asr_model_type``: needed only when training from scratch. ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel``
* ``asr_model_fuse_bn``: fuse BatchNorm in the pretrained ASR model; can improve quality in the fine-tuning scenario
* TTS model
* ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field
@@ -907,7 +907,7 @@ Main parts of the config:
* ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training)
* ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words
* ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value.
* ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). See parameters for ``nemo.collections.common.data.ConcatDataset``
* ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). Correspond to ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the :mod:`ConcatDataset <nemo.collections.common.data.dataset.ConcatDataset>`.
* all other components are similar to conventional ASR models
* ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model

@@ -920,7 +920,7 @@ Main parts of the config:
# asr model
asr_model_path: ???
asr_model: null
asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred
asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred
asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models
# tts model
@@ -972,6 +972,7 @@ Training from Scratch
To train an ASR model from scratch using text-only data, use the ``<NeMo_git_root>/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with a conventional ASR model config, e.g. ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``.
Please specify the ASR model type, paths to the TTS model and (optional) enhancer, along with the text-only data-related fields.
Use ``++`` or ``+`` markers for these options, since they are not present in the original ASR model config.
.. code-block:: shell
Binary file added docs/source/asr/images/hat.png
2 changes: 1 addition & 1 deletion docs/source/asr/models.rst
@@ -330,7 +330,7 @@ The model consists of three models:

* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
* Frozen TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch <FastPitch_model>` model is supported)
* Optional frozen Enhancer model trained to mitigate mismatch between real and generated mel spectrogram
* Optional frozen :ref:`Spectrogram Enhancer model <SpectrogramEnhancer_model>` trained to mitigate the mismatch between real and generated mel spectrograms

.. image:: images/hybrid_asr_tts_model.png
:align: center
13 changes: 13 additions & 0 deletions docs/source/common/data.rst
@@ -0,0 +1,13 @@
Data
----

.. autoclass:: nemo.collections.common.data.dataset.ConcatDataset
:show-inheritance:
:members:
:undoc-members:


.. autoclass:: nemo.collections.common.data.dataset.ConcatMapDataset
:show-inheritance:
:members:
:undoc-members:
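A minimal usage sketch is shown below. The two toy datasets are placeholders for real NeMo datasets, the ``sampling_*`` arguments mirror the ``asr_tts_sampling_*`` options described in the hybrid ASR-TTS configuration docs, and exact defaults may differ between NeMo versions, so check the class signature above.

.. code-block:: python

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from nemo.collections.common.data.dataset import ConcatDataset

    # Two toy map-style datasets standing in for real NeMo datasets.
    ds_a = TensorDataset(torch.arange(10).unsqueeze(1))
    ds_b = TensorDataset(torch.arange(100, 120).unsqueeze(1))

    # Interleave the two sources; samples are drawn according to the
    # chosen sampling technique and temperature.
    concat = ConcatDataset(
        [ds_a, ds_b],
        sampling_technique="temperature",
        sampling_temperature=5,
    )

    loader = DataLoader(concat, batch_size=4)
    first_batch = next(iter(loader))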
1 change: 1 addition & 0 deletions docs/source/common/intro.rst
@@ -10,3 +10,4 @@ The common collection contains things that could be used across all collections.
losses
metrics
tokenizers
data