Skip to content

Commit

Permalink
Divided parallel CI tests to reduce memory usage (NVIDIA#4600)
Browse files Browse the repository at this point in the history
Signed-off-by: Ameya Mahabaleshwarkar <ameyasm1154@gmail.com>

Co-authored-by: Eric Harper <complex451@gmail.com>
Signed-off-by: David Mosallanezhad <dmosallanezh@nvidia.com>
  • Loading branch information
2 people authored and Davood-M committed Aug 9, 2022
1 parent e8018f1 commit 0aa33d5
Showing 1 changed file with 45 additions and 21 deletions.
66 changes: 45 additions & 21 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -1181,7 +1181,7 @@ pipeline {
}
}

stage('L2: Parallel BERT/BART/GPT2 Question-Answering SQUAD v1.1 & v2.0') {
stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') {
when {
anyOf {
branch 'main'
Expand Down Expand Up @@ -1215,33 +1215,41 @@ pipeline {
exp_manager=null && TRANSFORMERS_OFFLINE=1'
}
}
stage('BART SQUAD 1.1') {
stage('BERT SQUAD 2.0') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
model.test_ds.num_samples=2 \
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=bert-base-uncased \
model.dataset.version_2_with_negative=true \
trainer.precision=16 \
trainer.devices=[0] \
trainer.devices=[1] \
trainer.accelerator="gpu" \
exp_manager=null && TRANSFORMERS_OFFLINE=1'
}
}
stage('GPT2 SQUAD 1.1') {
}
}

stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('BART SQUAD 1.1') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
Expand All @@ -1259,55 +1267,71 @@ pipeline {
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.language_model.pretrained_model_name=gpt2 \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=false \
trainer.precision=16 \
trainer.devices=[0] \
trainer.accelerator="gpu" \
exp_manager=null && TRANSFORMERS_OFFLINE=1'
}
}
stage('BERT SQUAD 2.0') {
stage('BART SQUAD 2.0') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=bert-base-uncased \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=true \
trainer.precision=16 \
trainer.devices=[1] \
trainer.accelerator="gpu" \
exp_manager=null && TRANSFORMERS_OFFLINE=1'
}
}
stage('BART SQUAD 2.0') {
}
}

stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('GPT2 SQUAD 1.1') {
// Cannot do fast_dev_run because squad needs whole dev dataset
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \
python question_answering.py \
model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \
model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \
model.dataset.use_cache=false \
model.dataset.check_if_answer_in_context=false \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \
model.train_ds.batch_size=2 \
model.train_ds.num_samples=2 \
model.validation_ds.batch_size=2 \
model.validation_ds.num_samples=2 \
model.test_ds.num_samples=2 \
model.test_ds.batch_size=2 \
trainer.max_epochs=1 \
trainer.max_steps=1 \
model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \
model.language_model.pretrained_model_name=facebook/bart-base \
model.dataset.version_2_with_negative=true \
model.language_model.pretrained_model_name=gpt2 \
model.dataset.version_2_with_negative=false \
trainer.precision=16 \
trainer.devices=[1] \
trainer.devices=[0] \
trainer.accelerator="gpu" \
exp_manager=null && TRANSFORMERS_OFFLINE=1'
}
Expand Down

0 comments on commit 0aa33d5

Please sign in to comment.