From 36fefcf94b5150d641d0ce46a050af93cad21014 Mon Sep 17 00:00:00 2001
From: Aman Karmani <aman@tmm1.net>
Date: Sun, 6 Aug 2023 23:59:09 -0700
Subject: [PATCH 1/3] set group_by_length to false in examples

---
 examples/cerebras/qlora.yml     | 2 +-
 examples/gptj/qlora.yml         | 2 +-
 examples/llama-2/lora.yml       | 2 +-
 examples/llama-2/qlora.yml      | 2 +-
 examples/openllama-3b/qlora.yml | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
index 9340299b95..2f3700249e 100644
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -35,7 +35,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
index 858c148623..f2427f4d4b 100644
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -32,7 +32,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index 74934320c8..494538fff9 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -38,7 +38,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 1a45e7268b..64728ac3d0 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -39,7 +39,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
index 83ae31f914..dcad9bbcdf 100644
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -34,7 +34,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true

From 9f991040388d7d0b3b36f24de64cb692503a8d71 Mon Sep 17 00:00:00 2001
From: Aman Karmani <aman@tmm1.net>
Date: Mon, 7 Aug 2023 01:04:56 -0700
Subject: [PATCH 2/3] update comment for group_by_length

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fe22bbc31b..479b1ab4ab 100644
--- a/README.md
+++ b/README.md
@@ -426,7 +426,8 @@ save_safetensors:
 
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
+# group similarly sized data to minimize padding
+# may be slower to start as it must download and sort the entire dataset
 group_by_length: false
 
 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing

From b4d1d2278256dbac4240d4971c847d0f5df63b1d Mon Sep 17 00:00:00 2001
From: Aman Karmani <aman@tmm1.net>
Date: Mon, 7 Aug 2023 16:18:42 -0700
Subject: [PATCH 3/3] note oscillating training loss pattern when using group_by_length

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 479b1ab4ab..bbba22b8fa 100644
--- a/README.md
+++ b/README.md
@@ -427,7 +427,8 @@ save_safetensors:
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # group similarly sized data to minimize padding
-# may be slower to start as it must download and sort the entire dataset
+# may be slower to start, as it must download and sort the entire dataset
+# note that training loss may have an oscillating pattern with this enabled
 group_by_length: false
 
 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing