From 36fefcf94b5150d641d0ce46a050af93cad21014 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sun, 6 Aug 2023 23:59:09 -0700 Subject: [PATCH 1/3] set group_by_length to false in examples --- examples/cerebras/qlora.yml | 2 +- examples/gptj/qlora.yml | 2 +- examples/llama-2/lora.yml | 2 +- examples/llama-2/qlora.yml | 2 +- examples/openllama-3b/qlora.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml index 9340299b95..2f3700249e 100644 --- a/examples/cerebras/qlora.yml +++ b/examples/cerebras/qlora.yml @@ -35,7 +35,7 @@ torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false -group_by_length: true +group_by_length: false bf16: true fp16: false tf32: true diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml index 858c148623..f2427f4d4b 100644 --- a/examples/gptj/qlora.yml +++ b/examples/gptj/qlora.yml @@ -32,7 +32,7 @@ torchdistx_path: lr_scheduler: cosine learning_rate: 0.0001 train_on_inputs: false -group_by_length: true +group_by_length: false bf16: true fp16: false tf32: true diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml index 74934320c8..494538fff9 100644 --- a/examples/llama-2/lora.yml +++ b/examples/llama-2/lora.yml @@ -38,7 +38,7 @@ lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false -group_by_length: true +group_by_length: false bf16: true fp16: false tf32: false diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml index 1a45e7268b..64728ac3d0 100644 --- a/examples/llama-2/qlora.yml +++ b/examples/llama-2/qlora.yml @@ -39,7 +39,7 @@ lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false -group_by_length: true +group_by_length: false bf16: true fp16: false tf32: false diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml index 83ae31f914..dcad9bbcdf 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/openllama-3b/qlora.yml @@ -34,7 +34,7 @@ torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false -group_by_length: true +group_by_length: false bf16: true fp16: false tf32: true From 9f991040388d7d0b3b36f24de64cb692503a8d71 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Mon, 7 Aug 2023 01:04:56 -0700 Subject: [PATCH 2/3] update comment for group_by_length --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fe22bbc31b..479b1ab4ab 100644 --- a/README.md +++ b/README.md @@ -426,7 +426,8 @@ save_safetensors: # whether to mask out or include the human's prompt from the training labels train_on_inputs: false -# don't use this, leads to wonky training (according to someone on the internet) +# group similarly sized data to minimize padding +# may be slower to start as it must download and sort the entire dataset group_by_length: false # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing From b4d1d2278256dbac4240d4971c847d0f5df63b1d Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Mon, 7 Aug 2023 16:18:42 -0700 Subject: [PATCH 3/3] note pattern when using groups --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 479b1ab4ab..bbba22b8fa 100644 --- a/README.md +++ b/README.md @@ -427,7 +427,8 @@ save_safetensors: # whether to mask out or include the human's prompt from the training labels train_on_inputs: false # group similarly sized data to minimize padding -# may be slower to start as it must download and sort the entire dataset +# may be slower to start, as it must download and sort the entire dataset +# note that training loss may have an oscillating pattern with this enabled group_by_length: false # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing