From 211e68ed2aefa44bdd32425d38dd81b2d3c841e8 Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 13:53:52 -0800
Subject: [PATCH 1/5] fix for num worker 0 causing issues in losses after 1 epoch

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 7 ++++++-
 .../megatron/t5_prompt_learning_dataset.py  | 9 +++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 3d0c29673c83..2c23b2468585 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import json
 import os
 import pickle
@@ -319,7 +320,11 @@ def __getitem__(self, idx):
 
     def collate_fn(self, batch, tp_workers=0):
         """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """
-        taskname_ids, input_ids, answer_starts = zip(*batch)
+        orig_taskname_ids, orig_input_ids, orig_answer_starts = zip(*batch)
+        taskname_ids = copy.deepcopy(orig_taskname_ids)
+        input_ids = copy.deepcopy(orig_input_ids)
+        answer_starts = copy.deepcopy(orig_answer_starts)
+

         # Pad taskname_ids to be the same length for the prompt encoder
         if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 0f39cd8e05c9..0d1521decda7 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import enum
+import copy
 import json
 
 import torch
@@ -195,7 +195,12 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc, a
 
     def collate_fn(self, batch):
         """ Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch """
-        taskname_ids, enc_input, dec_input, dec_labels = zip(*batch)
+        orig_taskname_ids, orig_enc_input, orig_dec_input, orig_dec_labels = zip(*batch)
+        taskname_ids = copy.deepcopy(orig_taskname_ids)
+        enc_input = copy.deepcopy(orig_enc_input)
+        dec_input = copy.deepcopy(orig_dec_input)
+        dec_labels = copy.deepcopy(orig_dec_labels)
+

         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 467af915a72b159c7666809ccc29afd165177a67 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Nov 2022 21:56:43 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../language_modeling/megatron/gpt_prompt_learning_dataset.py | 1 -
 .../language_modeling/megatron/t5_prompt_learning_dataset.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 2c23b2468585..411e807ca605 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -325,7 +325,6 @@ def collate_fn(self, batch, tp_workers=0):
         input_ids = copy.deepcopy(orig_input_ids)
         answer_starts = copy.deepcopy(orig_answer_starts)

-
         # Pad taskname_ids to be the same length for the prompt encoder
         if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
             max_taskname_length = max(len(ids) for ids in taskname_ids)
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 0d1521decda7..3a6cf921193f 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -200,7 +200,6 @@ def collate_fn(self, batch):
         enc_input = copy.deepcopy(orig_enc_input)
         dec_input = copy.deepcopy(orig_dec_input)
         dec_labels = copy.deepcopy(orig_dec_labels)

-
         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 855a3b83f1acac088ebd9dd389bb741143c421cb Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:08:56 -0800
Subject: [PATCH 3/5] fix .extend which is an inplace op

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index 2c23b2468585..189b99307cbd 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import json
 import os
 import pickle
@@ -320,10 +319,7 @@ def __getitem__(self, idx):
 
     def collate_fn(self, batch, tp_workers=0):
         """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """
-        orig_taskname_ids, orig_input_ids, orig_answer_starts = zip(*batch)
-        taskname_ids = copy.deepcopy(orig_taskname_ids)
-        input_ids = copy.deepcopy(orig_input_ids)
-        answer_starts = copy.deepcopy(orig_answer_starts)
+        taskname_ids, input_ids, answer_starts = zip(*batch)


         # Pad taskname_ids to be the same length for the prompt encoder
@@ -380,7 +376,8 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
             # Pad to max length
             input_length = len(ids)
             padding_length = batch_max - input_length
-            ids.extend([self.pad_token_id] * padding_length)
+            pad_extend = [self.pad_token_id] * padding_length
+            ids = ids + pad_extend

             # Account for padding in loss mask
             loss_mask.extend([0.0] * padding_length)

From f04ab5ee85842e4b826ce8ef32e0d8545ccf0b91 Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:10:20 -0800
Subject: [PATCH 4/5] deepcopy not needed in t5

Signed-off-by: arendu
---
 .../megatron/t5_prompt_learning_dataset.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
index 3a6cf921193f..2858d9d183df 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import json
 
 import torch
@@ -195,11 +194,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc, a
 
     def collate_fn(self, batch):
         """ Prepares enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids for global batch """
-        orig_taskname_ids, orig_enc_input, orig_dec_input, orig_dec_labels = zip(*batch)
-        taskname_ids = copy.deepcopy(orig_taskname_ids)
-        enc_input = copy.deepcopy(orig_enc_input)
-        dec_input = copy.deepcopy(orig_dec_input)
-        dec_labels = copy.deepcopy(orig_dec_labels)
+        taskname_ids, enc_input, dec_input, dec_labels = zip(*batch)

         taskname_ids = self.pad_taskname_ids(taskname_ids)


From 33a34724553319a43206e5294f97d811898b996d Mon Sep 17 00:00:00 2001
From: arendu
Date: Wed, 9 Nov 2022 15:29:52 -0800
Subject: [PATCH 5/5] removed extend inplace op

Signed-off-by: arendu
---
 .../megatron/gpt_prompt_learning_dataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
index a78e917a5aa8..69cd485b0ca5 100755
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py
@@ -364,6 +364,7 @@ def collate_fn(self, batch, tp_workers=0):
     def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
         """ Pad input_ids in batch to max batch length while building loss mask """
         batch_loss_masks = []
+        padded_input_ids = []
         for ids, answer_start_idx in zip(input_ids, answer_starts):
             if answer_start_idx is not None:
                 # Loss mask where answer tokens are 1.0 and all other tokens are 0.0
@@ -377,16 +378,17 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts):
             padding_length = batch_max - input_length
             pad_extend = [self.pad_token_id] * padding_length
             ids = ids + pad_extend
+            padded_input_ids.append(ids)

             # Account for padding in loss mask
             loss_mask.extend([0.0] * padding_length)
             batch_loss_masks.append(torch.tensor(loss_mask, dtype=torch.float))

         # Make into torch tensors
-        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        padded_input_ids = torch.tensor(padded_input_ids, dtype=torch.long)
         batch_loss_masks = torch.stack(batch_loss_masks)

-        return input_ids, batch_loss_masks
+        return padded_input_ids, batch_loss_masks

     def inference_collate_fn(self, batch):
         """
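The failure mode these patches fix is easy to reproduce outside NeMo: with num_workers=0, the DataLoader calls collate_fn in the main process on the very list objects the dataset holds, so an in-place list.extend() pads the cached examples themselves and every epoch after the first trains on already-padded inputs, shifting the losses. The sketch below is a minimal, hypothetical illustration of that mechanism and of the non-mutating alternative; the ToyDataset class and its method names are invented for this example and are not NeMo code.

# Minimal sketch (not NeMo code): why in-place extend() in collate_fn
# corrupts cached examples when num_workers=0, and a non-mutating fix.
class ToyDataset:
    def __init__(self):
        # Examples are stored once and returned by reference from __getitem__.
        self.examples = [[1, 2], [3, 4, 5]]

    def __getitem__(self, idx):
        return self.examples[idx]

    def collate_in_place(self, batch, pad_id=0):
        batch_max = max(len(ids) for ids in batch)
        for ids in batch:
            # Mutates the list that the dataset still holds.
            ids.extend([pad_id] * (batch_max - len(ids)))
        return batch

    def collate_non_mutating(self, batch, pad_id=0):
        batch_max = max(len(ids) for ids in batch)
        # Building a new list leaves the cached example untouched.
        return [ids + [pad_id] * (batch_max - len(ids)) for ids in batch]

ds = ToyDataset()
ds.collate_in_place([ds[0], ds[1]])
print(ds.examples[0])   # [1, 2, 0] -- the cached example now carries padding, so "epoch 2" sees it

ds2 = ToyDataset()
ds2.collate_non_mutating([ds2[0], ds2[1]])
print(ds2.examples[0])  # [1, 2] -- unchanged

Patches 1 and 2 work around the mutation with copy.deepcopy in collate_fn; patches 3 through 5 drop the copies and instead build new padded lists (ids = ids + pad_extend, collected into padded_input_ids), which avoids both the in-place mutation and the per-batch deep-copy cost.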