Add Prodigy Plus Schedule Free optimizer #614
base: master
Changes from all commits
dfd0f67
f58e835
170909e
8d3155a
1337f82
09f6d2b
5c8affc
a05aa2b
ad658e4
2e934c9
878c9c2
af67848
@@ -101,7 +101,6 @@ def create_dynamic_ui(
'd_coef': {'title': 'D Coefficient', 'tooltip': 'Coefficient in the expression for the estimate of d.', 'type': 'float'},
'dampening': {'title': 'Dampening', 'tooltip': 'Dampening for optimizer_momentum.', 'type': 'float'},
'decay_rate': {'title': 'Decay Rate', 'tooltip': 'Rate of decay for moment estimation.', 'type': 'float'},
'decouple': {'title': 'Decouple', 'tooltip': 'Use AdamW style optimizer_decoupled weight decay.', 'type': 'bool'},
'differentiable': {'title': 'Differentiable', 'tooltip': 'Whether the optimization function is optimizer_differentiable.', 'type': 'bool'},
'eps': {'title': 'EPS', 'tooltip': 'A small value to prevent division by zero.', 'type': 'float'},
'eps2': {'title': 'EPS 2', 'tooltip': 'A small value to prevent division by zero.', 'type': 'float'},
@@ -142,9 +141,18 @@ def create_dynamic_ui(
'r': {'title': 'R', 'tooltip': 'EMA factor.', 'type': 'float'},
'adanorm': {'title': 'AdaNorm', 'tooltip': 'Whether to use the AdaNorm variant', 'type': 'bool'},
'adam_debias': {'title': 'Adam Debias', 'tooltip': 'Only correct the denominator to avoid inflating step sizes early in training.', 'type': 'bool'},
'cautious': {'title': 'Cautious', 'tooltip': 'Whether to use the Cautious variant.', 'type': 'bool'},
'split_groups': {'title': 'Split Groups', 'tooltip': 'Track individual adaptation values for each parameter group. Recommended: True', 'type': 'bool'},
'split_groups_mean': {'title': 'Split Groups Mean', 'tooltip': 'When split_groups is True, use the harmonic mean of learning rates for all groups. This favours a more conservative LR', 'type': 'bool'},
'factored': {'title': 'Factored', 'tooltip': 'Use factored approximation of the second moment, similar to Adafactor. Recommended: True', 'type': 'bool'},
'use_stableadamw': {'title': 'Use StableAdamW', 'tooltip': 'Scales parameter updates by the root-mean-square of the normalised gradient, in essence identical to Adafactors gradient scaling. Recommended: True', 'type': 'bool'},
'use_muon_pp': {'title': 'Use Muon++', 'tooltip': 'Whether to use Muon++ variant.', 'type': 'bool'},
'use_cautious': {'title': 'Use Cautious', 'tooltip': 'Experimental. Perform "cautious" updates, as proposed in https://arxiv.org/pdf/2411.16085. Recommended: False', 'type': 'bool'},
'use_adopt': {'title': 'Use ADOPT', 'tooltip': 'Experimental. Partial implementation of (https://arxiv.org/abs/2411.02853). Recommended: False', 'type': 'bool'},
'lr': {'title': 'Learning Rate', 'tooltip': 'Learning rate adjustment parameter. Increases or decreases the Prodigy learning rate. Recommended: 1.0', 'type': 'float'},
'weignt_decay_by_lr': {'title': 'Weight Decay by LR', 'tooltip': 'If True, weight_decay is multiplied by the adaptive learning rate. Recommended: True', 'type': 'bool'},
'prodigy_steps': {'title': 'Prodigy Steps', 'tooltip': 'Freeze Prodigy stepsize adjustments after a certain optimiser step and releases all state memory required. Recommended: 25% total num steps', 'type': 'int'},
}

# @formatter:on

if not self.winfo_exists():  # check if this window isn't open
@@ -154,6 +162,11 @@ def create_dynamic_ui(

# Extract the keys for the selected optimizer
for index, key in enumerate(OPTIMIZER_DEFAULT_PARAMETERS[selected_optimizer].keys()):
if selected_optimizer == Optimizer.PRODIGY_PLUS_SCHEDULE_FREE and key not in [
'beta1', 'beta2', 'eps', 'weight_decay', 'use_bias_correction', 'safeguard_warmup', 'd0', 'd_coef', 'growth_rate', 'fsdp_in_use', 'split_groups', 'split_groups_mean', 'factored', 'fused_back_pass', 'use_stableadamw', 'use_muon_pp', 'use_cautious', 'use_adopt', 'weignt_decay_by_lr', 'prodigy_steps'
]:
continue

arg_info = KEY_DETAIL_MAP[key]

title = arg_info['title']

Review thread on the new key filter:

Can you explain this change? I'm not quite sure if or why it's needed.

@saunderez Can you comment on this? I don't want to merge something if I don't understand why it's done.

I think this is a side effect of setting the lr to 1.0 in OPTIMIZER_DEFAULT_PARAMETERS and not wanting to present it as configurable, which is really what it's doing here. Given that the other learning-rate-free optimizers don't do this, it's probably better if […]. Out of scope for this change would be to fix this sharp edge for all the optimizers that expect an lr of 1.0.
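Relating to the thread above, a sketch of the more general approach the reviewer hints at: hide the non-configurable lr field for every learning-rate-free optimizer instead of whitelisting keys for PRODIGY_PLUS_SCHEDULE_FREE alone. It reuses the module's existing names (Optimizer, OPTIMIZER_DEFAULT_PARAMETERS, KEY_DETAIL_MAP); the LR_FREE_OPTIMIZERS set and any members other than PRODIGY_PLUS_SCHEDULE_FREE are illustrative assumptions, not part of this PR.

```python
# Illustrative sketch, not part of this PR: skip 'lr' for every optimizer that
# expects lr == 1.0, rather than maintaining a per-optimizer key whitelist.
# LR_FREE_OPTIMIZERS and the commented-out members are assumptions.
LR_FREE_OPTIMIZERS = {
    Optimizer.PRODIGY_PLUS_SCHEDULE_FREE,
    # ...other learning-rate-free optimizers could be added here...
}
HIDDEN_KEYS_FOR_LR_FREE = {'lr'}

# Extract the keys for the selected optimizer
for index, key in enumerate(OPTIMIZER_DEFAULT_PARAMETERS[selected_optimizer].keys()):
    if selected_optimizer in LR_FREE_OPTIMIZERS and key in HIDDEN_KEYS_FOR_LR_FREE:
        continue  # don't render a UI row for parameters the user shouldn't change

    arg_info = KEY_DETAIL_MAP[key]
    title = arg_info['title']
    # ...the rest of the existing widget-building loop is unchanged...
```

This keeps the loop free of per-optimizer whitelists and makes the lr handling uniform across all lr-free optimizers, which is the "sharp edge" the reviewer calls out as out of scope here.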
@@ -90,6 +90,24 @@ class TrainOptimizerConfig(BaseConfig):
adanorm: bool
adam_debias: bool
cautious: bool
split_groups: bool
split_groups_mean: bool
factored: bool
use_stableadamw: bool
use_muon_pp: bool
use_cautious: bool
use_adopt: bool
prodigy_steps: int
use_adopt: bool
use_cautious: bool
Review comment: Is this supposed to be here twice?
use_muon_pp: bool
use_stableadamw: bool
weight_decay_by_lr: bool
factored: bool
split_groups: bool
split_groups_mean: bool
fused_back_pass: bool


def __init__(self, data: list[(str, Any, type, bool)]):
super().__init__(data)
@@ -158,6 +176,14 @@ def default_values():
data.append(("adanorm", False, bool, False))
data.append(("adam_debias", False, bool, False))
data.append(("cautious", False, bool, False))
data.append(("split_groups", True, bool, False))
data.append(("split_groups_mean", True, bool, False))
data.append(("factored", True, bool, False))
data.append(("use_stableadamw", True, bool, False))
data.append(("use_muon_pp", False, bool, False))
data.append(("use_cautious", False, bool, False))
data.append(("use_adopt", False, bool, False))
data.append(("prodigy_steps", 0, int, False))
Review comment: missing weight_decay_by_lr

return TrainOptimizerConfig(data)
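Picking up both review comments on this file, a consolidated version might look like the sketch below: each new field declared once, plus the weight_decay_by_lr default that is currently missing from default_values(). Field names are taken from the diff; the True default for weight_decay_by_lr is an assumption that follows the tooltip's "Recommended: True".

```python
# Sketch of the de-duplicated config, not the PR as submitted: each new field
# is declared exactly once on TrainOptimizerConfig.
split_groups: bool
split_groups_mean: bool
factored: bool
use_stableadamw: bool
use_muon_pp: bool
use_cautious: bool
use_adopt: bool
weight_decay_by_lr: bool
prodigy_steps: int
fused_back_pass: bool  # only if it is not already declared for another optimizer

# ...and the matching additions in default_values():
data.append(("split_groups", True, bool, False))
data.append(("split_groups_mean", True, bool, False))
data.append(("factored", True, bool, False))
data.append(("use_stableadamw", True, bool, False))
data.append(("use_muon_pp", False, bool, False))
data.append(("use_cautious", False, bool, False))
data.append(("use_adopt", False, bool, False))
data.append(("weight_decay_by_lr", True, bool, False))  # assumed default; missing from the current diff
data.append(("prodigy_steps", 0, int, False))
```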
@@ -34,6 +34,7 @@ lion-pytorch==0.2.2 # lion optimizer
prodigyopt==1.0 # prodigy optimizer
schedulefree==1.3.0 # schedule-free optimizers
pytorch_optimizer==3.3.0 # pytorch optimizers
prodigy-plus-schedule-free==1.8.0
Review comment: 1.9.0 got released rather recently. The only interface change is an extra parameter, […].

# Profiling
scalene==1.5.45
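For orientation, wiring the new dependency up to the config fields added in this PR could look roughly like the sketch below. The import path and constructor signature reflect my reading of the prodigy-plus-schedule-free package and should be verified against whichever release ends up pinned (1.8.0 vs 1.9.0); model and config are stand-ins for OneTrainer's own objects, and the keyword uses the corrected weight_decay_by_lr spelling flagged in the comments below.

```python
# Hedged sketch only, not OneTrainer's actual optimizer-creation code.
# Keyword names mirror the config fields introduced in this PR; verify the
# import path and signature against the pinned prodigy-plus-schedule-free release.
from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree

optimizer = ProdigyPlusScheduleFree(
    model.parameters(),                 # placeholder for the trained modules' parameters
    lr=1.0,                             # Prodigy adapts the step size; lr acts as a multiplier
    weight_decay=config.weight_decay,
    split_groups=config.split_groups,
    split_groups_mean=config.split_groups_mean,
    factored=config.factored,
    use_stableadamw=config.use_stableadamw,
    use_muon_pp=config.use_muon_pp,
    use_cautious=config.use_cautious,
    use_adopt=config.use_adopt,
    weight_decay_by_lr=config.weight_decay_by_lr,
    prodigy_steps=config.prodigy_steps,
)
```

Since this is a schedule-free optimizer, the usual schedule-free convention of calling optimizer.train() before training steps and optimizer.eval() before validation or saving presumably applies as well; that is worth confirming against the package README before merging.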
Review comment: weignt_decay_by_lr -> weight_decay_by_lr

Review comment: Actually, this is in a couple of places.