From dfd0f67c68e82ac4729806283a44c5d4dfdb0432 Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 19:23:41 +1000 Subject: [PATCH 01/11] Add Prodigy Plus Schedule Free optimizer --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/Nerogar/OneTrainer?shareId=XXXX-XXXX-XXXX-XXXX). --- modules/ui/OptimizerParamsWindow.py | 13 ++++++++++++- modules/util/create.py | 28 ++++++++++++++++++++++++++++ modules/util/enum/Optimizer.py | 4 ++++ modules/util/optimizer_util.py | 22 ++++++++++++++++++++++ requirements-global.txt | 1 + 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/modules/ui/OptimizerParamsWindow.py b/modules/ui/OptimizerParamsWindow.py index 74a5883d..d7524a6b 100644 --- a/modules/ui/OptimizerParamsWindow.py +++ b/modules/ui/OptimizerParamsWindow.py @@ -143,7 +143,13 @@ def create_dynamic_ui( 'adanorm': {'title': 'AdaNorm', 'tooltip': 'Whether to use the AdaNorm variant', 'type': 'bool'}, 'adam_debias': {'title': 'Adam Debias', 'tooltip': 'Only correct the denominator to avoid inflating step sizes early in training.', 'type': 'bool'}, 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, - + 'split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, + 'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'Whether to use mean for split groups.', 'type': 'bool'}, + 'factored': {'title': 'Factored', 'tooltip': 'Whether to use factored updates.', 'type': 'bool'}, + 'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Whether to use StableAdamW variant.', 'type': 'bool'}, + 'use_muon_pp': {'title': 'Use Muon++', 'tooltip': 'Whether to use Muon++ variant.', 'type': 'bool'}, + 'use_cautious': {'title': 'Use Cautious', 'tooltip': 'Whether to use Cautious variant.', 'type': 'bool'}, + 'use_adopt': {'title': 'Use ADOPT', 'tooltip': 'Whether to use ADOPT variant.', 'type': 'bool'}, } # @formatter:on @@ -154,6 +160,11 @@ def create_dynamic_ui( # Extract the keys for the selected optimizer for index, key in enumerate(OPTIMIZER_DEFAULT_PARAMETERS[selected_optimizer].keys()): + if selected_optimizer == Optimizer.PRODIGY_PLUS_SCHEDULE_FREE and key not in [ + 'beta1', 'beta2', 'eps', 'weight_decay', 'decouple', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt' + ]: + continue + arg_info = KEY_DETAIL_MAP[key] title = arg_info['title'] diff --git a/modules/util/create.py b/modules/util/create.py index 70c94ffe..e3052fff 100644 --- a/modules/util/create.py +++ b/modules/util/create.py @@ -831,6 +831,34 @@ def create_optimizer( fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False, ) + # PRODIGY_PLUS_SCHEDULE_FREE Optimizer + case Optimizer.PRODIGY_PLUS_SCHEDULE_FREE: + from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree + optimizer = ProdigyPlusScheduleFree( + params=parameters, + lr=config.learning_rate, + betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9, + optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999), + beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None, + eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, + weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is 
not None else 0, + decouple=optimizer_config.decouple if optimizer_config.decouple is not None else True, + use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, + safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False, + d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, + d_coef=optimizer_config.d_coef if optimizer_config.d_coef is not None else 1.0, + growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'), + fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False, + split_groups=optimizer_config.split_groups if optimizer_config.split_groups is not None else True, + split_groups_mean=optimizer_config.split_groups_mean if optimizer_config.split_groups_mean is not None else True, + factored=optimizer_config.factored if optimizer_config.factored is not None else True, + fused_back_pass=optimizer_config.fused_back_pass if optimizer_config.fused_back_pass is not None else False, + use_stableadamw=optimizer_config.use_stableadamw if optimizer_config.use_stableadamw is not None else True, + use_muon_pp=optimizer_config.use_muon_pp if optimizer_config.use_muon_pp is not None else False, + use_cautious=optimizer_config.use_cautious if optimizer_config.use_cautious is not None else False, + use_adopt=optimizer_config.use_adopt if optimizer_config.use_adopt is not None else False, + ) + # ADAFactor Optimizer case Optimizer.ADAFACTOR: from transformers.optimization import Adafactor diff --git a/modules/util/enum/Optimizer.py b/modules/util/enum/Optimizer.py index 4350b03d..5afa0017 100644 --- a/modules/util/enum/Optimizer.py +++ b/modules/util/enum/Optimizer.py @@ -52,6 +52,7 @@ class Optimizer(Enum): # Prodigy PRODIGY = 'PRODIGY' + PRODIGY_PLUS_SCHEDULE_FREE = 'PRODIGY_PLUS_SCHEDULE_FREE' # ADAFACTOR ADAFACTOR = 'ADAFACTOR' @@ -73,6 +74,7 @@ def is_adaptive(self): self.DADAPT_ADA_GRAD, self.DADAPT_LION, self.PRODIGY, + self.PRODIGY_PLUS_SCHEDULE_FREE, ] @property @@ -80,6 +82,7 @@ def is_schedule_free(self): return self in [ self.SCHEDULE_FREE_ADAMW, self.SCHEDULE_FREE_SGD, + self.PRODIGY_PLUS_SCHEDULE_FREE, ] def supports_fused_back_pass(self): @@ -88,6 +91,7 @@ def supports_fused_back_pass(self): Optimizer.CAME, Optimizer.ADAM, Optimizer.ADAMW, + Optimizer.PRODIGY_PLUS_SCHEDULE_FREE, ] # Small helper for adjusting learning rates to adaptive optimizers. 
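Illustration (not part of the patch): how the enum hooks added above are expected to read back for the new optimizer. The import path follows the diff headers; property-vs-method access mirrors the surrounding file and should be treated as an assumption, not a verified API.

# Minimal sketch, assuming the module layout from the diff headers above.
from modules.util.enum.Optimizer import Optimizer

opt = Optimizer.PRODIGY_PLUS_SCHEDULE_FREE
print(opt.is_adaptive)                 # True: treated like Prodigy/D-Adaptation for LR display/scaling
print(opt.is_schedule_free)            # True: pair with a constant scheduler, no LR schedule needed
print(opt.supports_fused_back_pass())  # True: fused backward pass can be enabled for this optimizer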
diff --git a/modules/util/optimizer_util.py b/modules/util/optimizer_util.py index 263108c9..150a828a 100644 --- a/modules/util/optimizer_util.py +++ b/modules/util/optimizer_util.py @@ -270,6 +270,28 @@ def init_model_parameters( "growth_rate": float('inf'), "fsdp_in_use": False, }, + Optimizer.PRODIGY_PLUS_SCHEDULE_FREE: { + "beta1": 0.9, + "beta2": 0.999, + "beta3": None, + "eps": 1e-8, + "weight_decay": 0, + "decouple": True, + "use_bias_correction": False, + "safeguard_warmup": False, + "d0": 1e-6, + "d_coef": 1.0, + "growth_rate": float('inf'), + "fsdp_in_use": False, + "split_groups": True, + "split_groups_mean": True, + "factored": True, + "fused_back_pass": False, + "use_stableadamw": True, + "use_muon_pp": False, + "use_cautious": False, + "use_adopt": False, + }, Optimizer.DADAPT_ADA_GRAD: { "momentum": 0, "log_every": 0, diff --git a/requirements-global.txt b/requirements-global.txt index b15fa348..676a25b3 100644 --- a/requirements-global.txt +++ b/requirements-global.txt @@ -34,6 +34,7 @@ lion-pytorch==0.2.2 # lion optimizer prodigyopt==1.0 # prodigy optimizer schedulefree==1.3.0 # schedule-free optimizers pytorch_optimizer==3.3.0 # pytorch optimizers +prodigy-plus-schedule-free==1.8.0 # Profiling scalene==1.5.45 From f58e835cb7763a1d04840c537026bcb17d8aee2f Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 21:06:32 +1000 Subject: [PATCH 02/11] Fixed parameter window hopefully --- modules/ui/OptimizerParamsWindow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/ui/OptimizerParamsWindow.py b/modules/ui/OptimizerParamsWindow.py index d7524a6b..918078d8 100644 --- a/modules/ui/OptimizerParamsWindow.py +++ b/modules/ui/OptimizerParamsWindow.py @@ -142,8 +142,7 @@ def create_dynamic_ui( 'r': {'title': 'R', 'tooltip': 'EMA factor.', 'type': 'float'}, 'adanorm': {'title': 'AdaNorm', 'tooltip': 'Whether to use the AdaNorm variant', 'type': 'bool'}, 'adam_debias': {'title': 'Adam Debias', 'tooltip': 'Only correct the denominator to avoid inflating step sizes early in training.', 'type': 'bool'}, - 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, - 'split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, + 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, ('split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, 'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'Whether to use mean for split groups.', 'type': 'bool'}, 'factored': {'title': 'Factored', 'tooltip': 'Whether to use factored updates.', 'type': 'bool'}, 'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Whether to use StableAdamW variant.', 'type': 'bool'}, From 170909ea884fa7b1da81bee9bc7eb52713870864 Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 21:09:27 +1000 Subject: [PATCH 03/11] Add UI components and logic for `PRODIGY_PLUS_SCHEDULE_FREE` in `OptimizerParamsWindow.py` * Add `split_groups` parameter with title, tooltip, and type. * Adjust formatting for `cautious` parameter to align with new `split_groups` parameter. 
--- modules/ui/OptimizerParamsWindow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/ui/OptimizerParamsWindow.py b/modules/ui/OptimizerParamsWindow.py index 918078d8..75f38227 100644 --- a/modules/ui/OptimizerParamsWindow.py +++ b/modules/ui/OptimizerParamsWindow.py @@ -142,7 +142,8 @@ def create_dynamic_ui( 'r': {'title': 'R', 'tooltip': 'EMA factor.', 'type': 'float'}, 'adanorm': {'title': 'AdaNorm', 'tooltip': 'Whether to use the AdaNorm variant', 'type': 'bool'}, 'adam_debias': {'title': 'Adam Debias', 'tooltip': 'Only correct the denominator to avoid inflating step sizes early in training.', 'type': 'bool'}, - 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, ('split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, + 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, + 'split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, 'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'Whether to use mean for split groups.', 'type': 'bool'}, 'factored': {'title': 'Factored', 'tooltip': 'Whether to use factored updates.', 'type': 'bool'}, 'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Whether to use StableAdamW variant.', 'type': 'bool'}, From 8d3155aade9c274ae3e0d1d49b6ce56ccbeb91b3 Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 21:25:54 +1000 Subject: [PATCH 04/11] Add a bunch of missing params to trainconfig.py --- modules/util/config/TrainConfig.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/modules/util/config/TrainConfig.py b/modules/util/config/TrainConfig.py index ebebe3b1..fcc126e4 100644 --- a/modules/util/config/TrainConfig.py +++ b/modules/util/config/TrainConfig.py @@ -90,6 +90,13 @@ class TrainOptimizerConfig(BaseConfig): adanorm: bool adam_debias: bool cautious: bool + split_groups: bool + split_groups_mean: bool + factored: bool + use_stableadamw: bool + use_muon_pp: bool + use_cautious: bool + use_adopt: bool def __init__(self, data: list[(str, Any, type, bool)]): super().__init__(data) @@ -158,6 +165,13 @@ def default_values(): data.append(("adanorm", False, bool, False)) data.append(("adam_debias", False, bool, False)) data.append(("cautious", False, bool, False)) + data.append(("split_groups", True, bool, False)) + data.append(("split_groups_mean", True, bool, False)) + data.append(("factored", True, bool, False)) + data.append(("use_stableadamw", True, bool, False)) + data.append(("use_muon_pp", False, bool, False)) + data.append(("use_cautious", False, bool, False)) + data.append(("use_adopt", False, bool, False)) return TrainOptimizerConfig(data) From 1337f82948c2a8795f5816145eeec6414b21c4f9 Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 21:39:56 +1000 Subject: [PATCH 05/11] Found another missing param and another unwanted one. 
--- modules/util/config/TrainConfig.py | 2 ++ modules/util/optimizer_util.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/util/config/TrainConfig.py b/modules/util/config/TrainConfig.py index fcc126e4..286f0db8 100644 --- a/modules/util/config/TrainConfig.py +++ b/modules/util/config/TrainConfig.py @@ -97,6 +97,7 @@ class TrainOptimizerConfig(BaseConfig): use_muon_pp: bool use_cautious: bool use_adopt: bool + prodigy_steps: int def __init__(self, data: list[(str, Any, type, bool)]): super().__init__(data) @@ -172,6 +173,7 @@ def default_values(): data.append(("use_muon_pp", False, bool, False)) data.append(("use_cautious", False, bool, False)) data.append(("use_adopt", False, bool, False)) + data.append(("prodigy_steps", 0, int, False)) return TrainOptimizerConfig(data) diff --git a/modules/util/optimizer_util.py b/modules/util/optimizer_util.py index 150a828a..27aa4d6b 100644 --- a/modules/util/optimizer_util.py +++ b/modules/util/optimizer_util.py @@ -276,13 +276,12 @@ def init_model_parameters( "beta3": None, "eps": 1e-8, "weight_decay": 0, - "decouple": True, "use_bias_correction": False, "safeguard_warmup": False, "d0": 1e-6, "d_coef": 1.0, + "prodigy_steps": 0, "growth_rate": float('inf'), - "fsdp_in_use": False, "split_groups": True, "split_groups_mean": True, "factored": True, From 09f6d2b842caec40fbb3ea078e40a38949f1da94 Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 2024 21:43:11 +1000 Subject: [PATCH 06/11] Damn you decouple --- modules/ui/OptimizerParamsWindow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/ui/OptimizerParamsWindow.py b/modules/ui/OptimizerParamsWindow.py index 75f38227..7d5dc9d7 100644 --- a/modules/ui/OptimizerParamsWindow.py +++ b/modules/ui/OptimizerParamsWindow.py @@ -101,7 +101,6 @@ def create_dynamic_ui( 'd_coef': {'title': 'D Coefficient', 'tooltip': 'Coefficient in the expression for the estimate of d.', 'type': 'float'}, 'dampening': {'title': 'Dampening', 'tooltip': 'Dampening for optimizer_momentum.', 'type': 'float'}, 'decay_rate': {'title': 'Decay Rate', 'tooltip': 'Rate of decay for moment estimation.', 'type': 'float'}, - 'decouple': {'title': 'Decouple', 'tooltip': 'Use AdamW style optimizer_decoupled weight decay.', 'type': 'bool'}, 'differentiable': {'title': 'Differentiable', 'tooltip': 'Whether the optimization function is optimizer_differentiable.', 'type': 'bool'}, 'eps': {'title': 'EPS', 'tooltip': 'A small value to prevent division by zero.', 'type': 'float'}, 'eps2': {'title': 'EPS 2', 'tooltip': 'A small value to prevent division by zero.', 'type': 'float'}, @@ -161,7 +160,7 @@ def create_dynamic_ui( # Extract the keys for the selected optimizer for index, key in enumerate(OPTIMIZER_DEFAULT_PARAMETERS[selected_optimizer].keys()): if selected_optimizer == Optimizer.PRODIGY_PLUS_SCHEDULE_FREE and key not in [ - 'beta1', 'beta2', 'eps', 'weight_decay', 'decouple', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt' + 'beta1', 'beta2', 'eps', 'weight_decay', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt' ]: continue From 5c8affc95846e4d84c541995849bb0c704a61f8d Mon Sep 17 00:00:00 2001 From: Chris Saunders Date: Wed, 25 Dec 
2024 21:50:17 +1000 Subject: [PATCH 07/11] Remove `decouple` parameter and add `prodigy_steps` parameter to `create_optimizer` function in `modules/util/create.py`. * **Remove `decouple` parameter:** - Remove `decouple` parameter from the `create_optimizer` function for different optimizer configurations. * **Add `prodigy_steps` parameter:** - Add `prodigy_steps` parameter to the `create_optimizer` function for `PRODIGY_PLUS_SCHEDULE_FREE` optimizer configuration. --- modules/util/create.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/modules/util/create.py b/modules/util/create.py index e3052fff..cd4d42b0 100644 --- a/modules/util/create.py +++ b/modules/util/create.py @@ -759,7 +759,6 @@ def create_optimizer( eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0, log_every=optimizer_config.log_every if optimizer_config.log_every is not None else 0, - decouple=optimizer_config.decouple if optimizer_config.decouple is not None else False, use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'), @@ -822,7 +821,6 @@ def create_optimizer( beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None, eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0, - decouple=optimizer_config.decouple if optimizer_config.decouple is not None else True, use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False, d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, @@ -838,15 +836,15 @@ def create_optimizer( params=parameters, lr=config.learning_rate, betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9, - optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999), - beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None, + optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999, + beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None), eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0, - decouple=optimizer_config.decouple if optimizer_config.decouple is not None else True, use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False, d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, d_coef=optimizer_config.d_coef if optimizer_config.d_coef is not None else 1.0, + prodigy_steps=optimizer_config.prodigy_steps if optimizer_config.prodigy_steps is not None else 0, growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'), fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False, split_groups=optimizer_config.split_groups if 
optimizer_config.split_groups is not None else True,

From a05aa2b89d05f8e080afd45838b69eb76d9cb860 Mon Sep 17 00:00:00 2001
From: Chris Saunders
Date: Wed, 25 Dec 2024 21:53:05 +1000
Subject: [PATCH 08/11]

---
 modules/util/create.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/util/create.py b/modules/util/create.py
index cd4d42b0..473288a6 100644
--- a/modules/util/create.py
+++ b/modules/util/create.py
@@ -529,7 +529,7 @@ def create_optimizer(
             lr=config.learning_rate,
             betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
                    optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999,
-                   optimizer_config.beta3 if optimizer_config.beta1 is not None else 0.9999,),
+                   optimizer_config.beta3 if optimizer_config.beta3 is not None else 0.9999),
             weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 1e-2,
             eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8,
             alpha=optimizer_config.alpha if optimizer_config.alpha is not None else 5,

From ad658e4e2bfb1a749e0f07309cdba7c667cbd009 Mon Sep 17 00:00:00 2001
From: Chris Saunders
Date: Wed, 25 Dec 2024 21:53:25 +1000
Subject: [PATCH 09/11]

From 2e934c98d4610e242d57d8aecd07c2ebf368e6c0 Mon Sep 17 00:00:00 2001
From: Chris Saunders
Date: Wed, 25 Dec 2024 21:55:45 +1000
Subject: [PATCH 10/11]

---
 modules/util/create.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/util/create.py b/modules/util/create.py
index 473288a6..47f4295f 100644
--- a/modules/util/create.py
+++ b/modules/util/create.py
@@ -817,8 +817,8 @@ def create_optimizer(
             params=parameters,
             lr=config.learning_rate,
             betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9,
-                   optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999),
-            beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None,
+                   optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999,
+                   beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None),
             eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8,
             weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0,
             use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False,
             safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False,
             d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6,
             d_coef=optimizer_config.d_coef if optimizer_config.d_coef is not None else 1.0,
             growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'),
             fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False,

From 878c9c2af4d49321fc58c9d2e18f8d1277f2f396 Mon Sep 17 00:00:00 2001
From: Chris Saunders
Date: Wed, 25 Dec 2024 21:55:45 +1000
Subject: [PATCH 11/11] Fixed all remaining bugs and trained successfully

This commit adds a variant of the Prodigy Optimizer created by LoganBooker:
https://github.com/LoganBooker/prodigy-plus-schedule-free/tree/main

It is both learning rate free and schedule free. It also contains experimental
optimization techniques, as well as general memory usage and performance
improvements. Use with a constant scheduler.

Based on code from:
https://github.com/facebookresearch/schedule_free
https://github.com/konstmish/prodigy

Incorporates improvements from these pull requests (credit to
https://github.com/dxqbYD and https://github.com/sangoi-exe):
https://github.com/konstmish/prodigy/pull/23
https://github.com/konstmish/prodigy/pull/22
https://github.com/konstmish/prodigy/pull/20

Supports fused backwards pass.

Experimental features:
ADOPT https://arxiv.org/abs/2411.02853 Cautious https://arxiv.org/pdf/2411.16085 MuonPP https://github.com/KellerJordan/Muon/blob/master/muon.py StableAdamW https://optimi.benjaminwarner.dev/optimizers/stableadamw/ Probably some other stuff I forgot. For full details --- modules/ui/OptimizerParamsWindow.py | 19 +++++++++++-------- modules/util/config/TrainConfig.py | 10 ++++++++++ modules/util/create.py | 16 ++++++---------- modules/util/optimizer_util.py | 15 ++++++++------- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/modules/ui/OptimizerParamsWindow.py b/modules/ui/OptimizerParamsWindow.py index 7d5dc9d7..46c88456 100644 --- a/modules/ui/OptimizerParamsWindow.py +++ b/modules/ui/OptimizerParamsWindow.py @@ -141,15 +141,18 @@ def create_dynamic_ui( 'r': {'title': 'R', 'tooltip': 'EMA factor.', 'type': 'float'}, 'adanorm': {'title': 'AdaNorm', 'tooltip': 'Whether to use the AdaNorm variant', 'type': 'bool'}, 'adam_debias': {'title': 'Adam Debias', 'tooltip': 'Only correct the denominator to avoid inflating step sizes early in training.', 'type': 'bool'}, - 'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'}, - 'split_groups': {'title': 'Split Groups', 'tooltip': 'Whether to split parameter groups.', 'type': 'bool'}, - 'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'Whether to use mean for split groups.', 'type': 'bool'}, - 'factored': {'title': 'Factored', 'tooltip': 'Whether to use factored updates.', 'type': 'bool'}, - 'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Whether to use StableAdamW variant.', 'type': 'bool'}, + 'split_groups': {'title': 'Split Groups', 'tooltip': 'Track individual adaptation values for each parameter group. Recommended: True', 'type': 'bool'}, + 'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'When split_groups is True, use the harmonic mean of learning rates for all groups. This favours a more conservative LR', 'type': 'bool'}, + 'factored': {'title': 'Factored', 'tooltip': 'Use factored approximation of the second moment, similar to Adafactor. Recommended: True', 'type': 'bool'}, + 'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Scales parameter updates by the root-mean-square of the normalised gradient, in essence identical to Adafactors gradient scaling. Recommended: True', 'type': 'bool'}, 'use_muon_pp': {'title': 'Use Muon++', 'tooltip': 'Whether to use Muon++ variant.', 'type': 'bool'}, - 'use_cautious': {'title': 'Use Cautious', 'tooltip': 'Whether to use Cautious variant.', 'type': 'bool'}, - 'use_adopt': {'title': 'Use ADOPT', 'tooltip': 'Whether to use ADOPT variant.', 'type': 'bool'}, + 'use_cautious': {'title': 'Use Cautious', 'tooltip': 'Experimental. Perform "cautious" updates, as proposed in https://arxiv.org/pdf/2411.16085. Recommended: False', 'type': 'bool'}, + 'use_adopt': {'title': 'Use ADOPT', 'tooltip': 'Experimental. Partial implementation of (https://arxiv.org/abs/2411.02853). Recommended: False', 'type': 'bool'}, + 'lr': {'title': 'Learning Rate', 'tooltip': 'Learning rate adjustment parameter. Increases or decreases the Prodigy learning rate. Recommended: 1.0', 'type': 'float'}, + 'weignt_decay_by_lr': {'title': 'Weight Decay by LR', 'tooltip': 'If True, weight_decay is multiplied by the adaptive learning rate. Recommended: True', 'type': 'bool'}, + 'prodigy_steps': {'title': 'Prodigy Steps', 'tooltip': 'Freeze Prodigy stepsize adjustments after a certain optimiser step and releases all state memory required. 
Reccomended: 25% total num steps', 'type': 'int'}, } + # @formatter:on if not self.winfo_exists(): # check if this window isn't open @@ -160,7 +163,7 @@ def create_dynamic_ui( # Extract the keys for the selected optimizer for index, key in enumerate(OPTIMIZER_DEFAULT_PARAMETERS[selected_optimizer].keys()): if selected_optimizer == Optimizer.PRODIGY_PLUS_SCHEDULE_FREE and key not in [ - 'beta1', 'beta2', 'eps', 'weight_decay', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt' + 'beta1', 'beta2', 'eps', 'weight_decay', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt', 'weignt_decay_by_lr', 'prodigy_steps' ]: continue diff --git a/modules/util/config/TrainConfig.py b/modules/util/config/TrainConfig.py index 286f0db8..89a57683 100644 --- a/modules/util/config/TrainConfig.py +++ b/modules/util/config/TrainConfig.py @@ -98,6 +98,16 @@ class TrainOptimizerConfig(BaseConfig): use_cautious: bool use_adopt: bool prodigy_steps: int + use_adopt: bool + use_cautious: bool + use_muon_pp: bool + use_stableadamw: bool + weight_decay_by_lr: bool + factored: bool + split_groups: bool + split_groups_mean: bool + fused_back_pass: bool + def __init__(self, data: list[(str, Any, type, bool)]): super().__init__(data) diff --git a/modules/util/create.py b/modules/util/create.py index 473288a6..d7f89820 100644 --- a/modules/util/create.py +++ b/modules/util/create.py @@ -817,10 +817,8 @@ def create_optimizer( params=parameters, lr=config.learning_rate, betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9, - optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999), - beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None, + optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999), eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, - weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0, use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False, d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, @@ -831,22 +829,18 @@ def create_optimizer( # PRODIGY_PLUS_SCHEDULE_FREE Optimizer case Optimizer.PRODIGY_PLUS_SCHEDULE_FREE: - from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree + from prodigyplus import ProdigyPlusScheduleFree optimizer = ProdigyPlusScheduleFree( params=parameters, lr=config.learning_rate, betas=(optimizer_config.beta1 if optimizer_config.beta1 is not None else 0.9, - optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999, - beta3=optimizer_config.beta3 if optimizer_config.beta3 is not None else None), + optimizer_config.beta2 if optimizer_config.beta2 is not None else 0.999), eps=optimizer_config.eps if optimizer_config.eps is not None else 1e-8, weight_decay=optimizer_config.weight_decay if optimizer_config.weight_decay is not None else 0, use_bias_correction=optimizer_config.use_bias_correction if optimizer_config.use_bias_correction is not None else False, - 
safeguard_warmup=optimizer_config.safeguard_warmup if optimizer_config.safeguard_warmup is not None else False, d0=optimizer_config.d0 if optimizer_config.d0 is not None else 1e-6, d_coef=optimizer_config.d_coef if optimizer_config.d_coef is not None else 1.0, prodigy_steps=optimizer_config.prodigy_steps if optimizer_config.prodigy_steps is not None else 0, - growth_rate=optimizer_config.growth_rate if optimizer_config.growth_rate is not None else float('inf'), - fsdp_in_use=optimizer_config.fsdp_in_use if optimizer_config.fsdp_in_use is not None else False, split_groups=optimizer_config.split_groups if optimizer_config.split_groups is not None else True, split_groups_mean=optimizer_config.split_groups_mean if optimizer_config.split_groups_mean is not None else True, factored=optimizer_config.factored if optimizer_config.factored is not None else True, @@ -855,6 +849,8 @@ def create_optimizer( use_muon_pp=optimizer_config.use_muon_pp if optimizer_config.use_muon_pp is not None else False, use_cautious=optimizer_config.use_cautious if optimizer_config.use_cautious is not None else False, use_adopt=optimizer_config.use_adopt if optimizer_config.use_adopt is not None else False, + stochastic_rounding=optimizer_config.stochastic_rounding if optimizer_config.stochastic_rounding is not None else True, + weight_decay_by_lr=optimizer_config.weight_decay_by_lr if optimizer_config.weight_decay_by_lr is not None else True, ) # ADAFactor Optimizer @@ -998,7 +994,7 @@ def create_optimizer( param_groups.append(old_group) old_group['lr'] = new_group['lr'] - old_group['initial_lr'] = new_group['initial_lr'] + old_group['initial_lr'] = new _group['initial_lr'] else: # the group state was not saved, initialize with an empty group state new_group = new_param_groups[new_group_index] diff --git a/modules/util/optimizer_util.py b/modules/util/optimizer_util.py index 27aa4d6b..60052195 100644 --- a/modules/util/optimizer_util.py +++ b/modules/util/optimizer_util.py @@ -42,9 +42,9 @@ def update_optimizer_config(train_config: TrainConfig): saved_optimizer_config = train_config.optimizer_defaults[str(optimizer)] saved_optimizer_config.from_dict(train_config.optimizer.to_dict()) else: - optimizer_donfig = TrainOptimizerConfig.default_values() - optimizer_donfig.from_dict(train_config.optimizer.to_dict()) - train_config.optimizer_defaults[str(optimizer)] = optimizer_donfig + optimizer_config = TrainOptimizerConfig.default_values() + optimizer_config.from_dict(train_config.optimizer.to_dict()) + train_config.optimizer_defaults[str(optimizer)] = optimizer_config def init_model_parameters( @@ -271,17 +271,17 @@ def init_model_parameters( "fsdp_in_use": False, }, Optimizer.PRODIGY_PLUS_SCHEDULE_FREE: { + "lr": 1.0, "beta1": 0.9, "beta2": 0.999, "beta3": None, - "eps": 1e-8, - "weight_decay": 0, + "weight_decay": 0.0, + "weignt_decay_by_lr": True, "use_bias_correction": False, - "safeguard_warmup": False, "d0": 1e-6, "d_coef": 1.0, "prodigy_steps": 0, - "growth_rate": float('inf'), + "eps": 1e-8, "split_groups": True, "split_groups_mean": True, "factored": True, @@ -290,6 +290,7 @@ def init_model_parameters( "use_muon_pp": False, "use_cautious": False, "use_adopt": False, + "stochastic_rounding": True, }, Optimizer.DADAPT_ADA_GRAD: { "momentum": 0,
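The defaults above map directly onto standalone use of the library pinned in requirements-global.txt. A minimal sketch follows (not part of the patch, independent of OneTrainer); the keyword names mirror the create_optimizer() call in the final patch, while the train()/eval() switching follows the usual schedule-free convention and should be treated as an assumption rather than a verified API.

# Standalone sketch: prodigy-plus-schedule-free==1.8.0 with the defaults used in optimizer_util.py.
import torch
from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree

model = torch.nn.Linear(16, 1)
optimizer = ProdigyPlusScheduleFree(
    params=model.parameters(),
    lr=1.0,                  # learning-rate free: 1.0 lets the Prodigy d-estimate set the step size
    weight_decay=0.0,
    prodigy_steps=0,         # 0 = never freeze the step-size adaptation
    split_groups=True,
    factored=True,
    use_stableadamw=True,
)

optimizer.train()            # schedule-free optimizers keep separate train/eval weights (assumed API)
for _ in range(10):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 16)).pow(2).mean()
    loss.backward()
    optimizer.step()
optimizer.eval()             # switch back before validation or checkpointing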