axolotl-ai-cloud · tmm1 · Aug 9, 2023 · Aug 7, 2023 · Aug 7, 2023 · Aug 7, 2023
diff --git a/README.md b/README.md
@@ -426,7 +426,9 @@ save_safetensors:
 
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
+# group samples of similar length together to minimize padding
+# may be slower to start, as the entire dataset must be downloaded and sorted first
+# note: training loss may show an oscillating pattern when this is enabled
 group_by_length: false
 
 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing

diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
@@ -35,7 +35,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true

diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
@@ -32,7 +32,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true

diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
@@ -38,7 +38,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false

diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
@@ -39,7 +39,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false

diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
@@ -34,7 +34,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true