From 211e68ed2aefa44bdd32425d38dd81b2d3c841e8 Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 13:53:52 -0800
Subject: [PATCH 1/5] fix for num worker 0 causing issues in losses after 1 epoch

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 7 ++++++-
 .../megatron/t5_prompt_learning_dataset.py  | 9 +++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 3d0c29673c83..2c23b2468585 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import json
 import os
 import pickle
@@ -319,7 +320,11 @@ def __getitem__(self, idx):
 
     def collate_fn(self, batch, tp_workers=0):
         """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """
-        taskname_ids, input_ids, answer_starts = zip(*batch)
+        orig_taskname_ids, orig_input_ids, orig_answer_starts = zip(*batch)
+        taskname_ids = copy.deepcopy(orig_taskname_ids)
+        input_ids = copy.deepcopy(orig_input_ids)
+        answer_starts = copy.deepcopy(orig_answer_starts)
+

         # Pad taskname_ids to be the same length for the prompt encoder
         if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 0f39cd8e05c9..0d1521decda7 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import enum
+import copy
 import json
 
 import torch
@@ -195,7 +195,12 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc, a
 
     def collate_fn(self, batch):
         """ Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch """
-        taskname_ids, enc_input, dec_input, dec_labels = zip(*batch)
+        orig_taskname_ids, orig_enc_input, orig_dec_input, orig_dec_labels = zip(*batch)
+        taskname_ids = copy.deepcopy(orig_taskname_ids)
+        enc_input = copy.deepcopy(orig_enc_input)
+        dec_input = copy.deepcopy(orig_dec_input)
+        dec_labels = copy.deepcopy(orig_dec_labels)
+

         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 467af915a72b159c7666809ccc29afd165177a67 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Nov 2022 21:56:43 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../language_modeling/megatron/gpt_prompt_learning_dataset.py | 1 -
 .../language_modeling/megatron/t5_prompt_learning_dataset.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 2c23b2468585..411e807ca605 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -325,7 +325,6 @@ def collate_fn(self, batch, tp_workers=0):
         input_ids = copy.deepcopy(orig_input_ids)
         answer_starts = copy.deepcopy(orig_answer_starts)

-
         # Pad taskname_ids to be the same length for the prompt encoder
         if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
             max_taskname_length = max(len(ids) for ids in taskname_ids)
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 0d1521decda7..3a6cf921193f 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -200,7 +200,6 @@ def collate_fn(self, batch):
         enc_input = copy.deepcopy(orig_enc_input)
         dec_input = copy.deepcopy(orig_dec_input)
         dec_labels = copy.deepcopy(orig_dec_labels)

-
         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 855a3b83f1acac088ebd9dd389bb741143c421cb Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:08:56 -0800
Subject: [PATCH 3/5] fix .extend which is an inplace op

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 2c23b2468585..189b99307cbd 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import json
 import os
 import pickle
@@ -320,10 +319,7 @@ def __getitem__(self, idx):
 
     def collate_fn(self, batch, tp_workers=0):
         """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """
-        orig_taskname_ids, orig_input_ids, orig_answer_starts = zip(*batch)
-        taskname_ids = copy.deepcopy(orig_taskname_ids)
-        input_ids = copy.deepcopy(orig_input_ids)
-        answer_starts = copy.deepcopy(orig_answer_starts)
+        taskname_ids, input_ids, answer_starts = zip(*batch)


         # Pad taskname_ids to be the same length for the prompt encoder
@@ -380,7 +376,8 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
             # Pad to max length
             input_length = len(ids)
             padding_length = batch_max - input_length
-            ids.extend([self.pad_token_id] * padding_length)
+            pad_extend = [self.pad_token_id] * padding_length
+            ids = ids + pad_extend

             # Account for padding in loss mask
             loss_mask.extend([0.0] * padding_length)

From f04ab5ee85842e4b826ce8ef32e0d8545ccf0b91 Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:10:20 -0800
Subject: [PATCH 4/5] deepcopy not needed in t5

Signed-off-by: arendu
---
 .../megatron/t5_prompt_learning_dataset.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 3a6cf921193f..2858d9d183df 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import json
 
 import torch
@@ -195,11 +194,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc, a
 
     def collate_fn(self, batch):
         """ Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch """
-        orig_taskname_ids, orig_enc_input, orig_dec_input, orig_dec_labels = zip(*batch)
-        taskname_ids = copy.deepcopy(orig_taskname_ids)
-        enc_input = copy.deepcopy(orig_enc_input)
-        dec_input = copy.deepcopy(orig_dec_input)
-        dec_labels = copy.deepcopy(orig_dec_labels)
+        taskname_ids, enc_input, dec_input, dec_labels = zip(*batch)

         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 33a34724553319a43206e5294f97d811898b996d Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:29:52 -0800
Subject: [PATCH 5/5] removed extend inplace op

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index a78e917a5aa8..69cd485b0ca5 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -364,6 +364,7 @@ def collate_fn(self, batch, tp_workers=0):
     def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
         """ Pad input_ids in batch to max batch length while building loss mask """
         batch_loss_masks = []
+        padded_input_ids = []
         for ids, answer_start_idx in zip(input_ids, answer_starts):
             if answer_start_idx is not None:
                 # Loss mask where answer tokens are 1.0 and all other tokens are 0.0
@@ -377,16 +378,17 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
             padding_length = batch_max - input_length
             pad_extend = [self.pad_token_id] * padding_length
             ids = ids + pad_extend
+            padded_input_ids.append(ids)

             # Account for padding in loss mask
             loss_mask.extend([0.0] * padding_length)
             batch_loss_masks.append(torch.tensor(loss_mask, dtype=torch.float))

         # Make into torch tensors
-        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        padded_input_ids = torch.tensor(padded_input_ids, dtype=torch.long)
         batch_loss_masks = torch.stack(batch_loss_masks)

-        return input_ids, batch_loss_masks
+        return padded_input_ids, batch_loss_masks

     def inference_collate_fn(self, batch):
         """
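The failure mode these patches fix is easy to reproduce outside NeMo: with num_workers=0, the DataLoader calls collate_fn in the main process on the very list objects the dataset holds, so an in-place list.extend() pads the cached examples themselves and every epoch after the first trains on already-padded inputs, shifting the losses. The sketch below is a minimal, hypothetical illustration of that mechanism and of the non-mutating alternative; the ToyDataset class and its method names are invented for this example and are not NeMo code.

# Minimal sketch (not NeMo code): why in-place extend() in collate_fn
# corrupts cached examples when num_workers=0, and a non-mutating fix.
class ToyDataset:
    def __init__(self):
        # Examples are stored once and returned by reference from __getitem__.
        self.examples = [[1, 2], [3, 4, 5]]

    def __getitem__(self, idx):
        return self.examples[idx]

    def collate_in_place(self, batch, pad_id=0):
        batch_max = max(len(ids) for ids in batch)
        for ids in batch:
            # Mutates the list that the dataset still holds.
            ids.extend([pad_id] * (batch_max - len(ids)))
        return batch

    def collate_non_mutating(self, batch, pad_id=0):
        batch_max = max(len(ids) for ids in batch)
        # Building a new list leaves the cached example untouched.
        return [ids + [pad_id] * (batch_max - len(ids)) for ids in batch]

ds = ToyDataset()
ds.collate_in_place([ds[0], ds[1]])
print(ds.examples[0])   # [1, 2, 0] -- the cached example now carries padding, so "epoch 2" sees it

ds2 = ToyDataset()
ds2.collate_non_mutating([ds2[0], ds2[1]])
print(ds2.examples[0])  # [1, 2] -- unchanged

Patches 1 and 2 work around the mutation with copy.deepcopy in collate_fn; patches 3 through 5 drop the copies and instead build new padded lists (ids = ids + pad_extend, collected into padded_input_ids), which avoids both the in-place mutation and the per-batch deep-copy cost.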