diff --git a/README.md b/README.md index 89095bbe2..c06903f99 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **74 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -94,83 +94,84 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | 
*Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | 
[github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | 
[github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | 
[github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for 
Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | 
[github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale 
vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural 
Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | +| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | +| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| TRAC | *Adaptive Parameter-free Optimization* | [github](https://github.com/ComputationalRobotics/TRAC) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240516642M/exportcitation) | ## Supported LR Scheduler @@ -336,7 +337,7 @@ If you use this software, please cite it below. 
Or you can get it from "cite thi month = jan, title = {{pytorch_optimizer: optimizer & lr scheduler & loss function collections in PyTorch}}, url = {https://github.com/kozistr/pytorch_optimizer}, - version = {3.0.1}, + version = {3.1.0}, year = {2021} } diff --git a/docs/changelogs/v3.1.1.md b/docs/changelogs/v3.1.1.md new file mode 100644 index 000000000..07a3115df --- /dev/null +++ b/docs/changelogs/v3.1.1.md @@ -0,0 +1,11 @@ +## Change Log + +### Feature + +* Implement `TRAC` optimizer. (#263) + * [Fast TRAC: A Parameter-Free Optimizer for Lifelong Reinforcement Learning](https://arxiv.org/abs/2405.16642) +* Support `AdamW` optimizer via `create_optimizer()`. (#263) + +### Bug + +* Fix to handle the optimizers that only take the `model` instead of the parameters in `create_optimizer()`. (#263) diff --git a/docs/index.md b/docs/index.md index 89095bbe2..c06903f99 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **74 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -94,83 +94,84 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On 
the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | 
[github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | 
+|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance 
between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive 
optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| 
Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | 
[cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | 
[cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | +| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | +| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| TRAC | *Adaptive Parameter-free Optimization* | [github](https://github.com/ComputationalRobotics/TRAC) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240516642M/exportcitation) | ## Supported LR Scheduler @@ -336,7 +337,7 @@ If you use this software, please cite it below. Or you can get it from "cite thi month = jan, title = {{pytorch_optimizer: optimizer & lr scheduler & loss function collections in PyTorch}}, url = {https://github.com/kozistr/pytorch_optimizer}, - version = {3.0.1}, + version = {3.1.0}, year = {2021} } diff --git a/docs/optimizer.md b/docs/optimizer.md index bd4f4e6ac..0fb3628fc 100644 --- a/docs/optimizer.md +++ b/docs/optimizer.md @@ -328,6 +328,10 @@ :docstring: :members: +::: pytorch_optimizer.TRAC + :docstring: + :members: + ::: pytorch_optimizer.WSAM :docstring: :members: diff --git a/pyproject.toml b/pyproject.toml index 0242aaf7c..cdef27c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ keywords = [ "GaLore", "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", - "SM3", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", + "SM3", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore", ] diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index 3628bb74e..25241f9f4 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -4,6 +4,7 @@ import torch.cuda from torch import nn +from torch.optim import AdamW from pytorch_optimizer.base.types import OPTIMIZER, PARAMETERS, SCHEDULER from pytorch_optimizer.loss.bi_tempered import BinaryBiTemperedLogisticLoss, BiTemperedLogisticLoss @@ -115,6 +116,7 @@ from pytorch_optimizer.optimizer.srmm import SRMM from pytorch_optimizer.optimizer.swats import SWATS from pytorch_optimizer.optimizer.tiger import Tiger +from pytorch_optimizer.optimizer.trac import TRAC from pytorch_optimizer.optimizer.utils import ( clip_grad_norm, disable_running_stats, @@ -131,6 +133,7 @@ HAS_Q_GALORE: bool = find_spec('q-galore-torch') is not None OPTIMIZER_LIST: List[OPTIMIZER] = [ + AdamW, AdaBelief, AdaBound, PID, @@ -350,6 +353,8 @@ def create_optimizer( if optimizer_name == 'alig': optimizer = optimizer(parameters, max_lr=lr, **kwargs) + elif optimizer_name in {'lomo', 'adalomo', 'adammini'}: + optimizer = optimizer(model, lr=lr, **kwargs) else: optimizer = optimizer(parameters, 
lr=lr, **kwargs) diff --git a/pytorch_optimizer/optimizer/lookahead.py b/pytorch_optimizer/optimizer/lookahead.py index 44a0be2ed..2291d50a2 100644 --- a/pytorch_optimizer/optimizer/lookahead.py +++ b/pytorch_optimizer/optimizer/lookahead.py @@ -22,7 +22,7 @@ def __init__( k: int = 5, alpha: float = 0.5, pullback_momentum: str = 'none', - ): + ) -> None: self.validate_positive(k, 'k') self.validate_range(alpha, 'alpha', 0.0, 1.0) self.validate_options(pullback_momentum, 'pullback_momentum', ['none', 'reset', 'pullback']) diff --git a/pytorch_optimizer/optimizer/trac.py b/pytorch_optimizer/optimizer/trac.py new file mode 100644 index 000000000..7dbc98e5a --- /dev/null +++ b/pytorch_optimizer/optimizer/trac.py @@ -0,0 +1,253 @@ +from typing import Callable, Dict, List, Tuple + +import torch +from torch import nn + +from pytorch_optimizer.base.optimizer import BaseOptimizer +from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, OPTIMIZER + + +def polyval(x: torch.Tensor, coef: torch.Tensor) -> torch.Tensor: + r"""Implement of the Horner scheme to evaluate a polynomial. + + taken from https://discuss.pytorch.org/t/polynomial-evaluation-by-horner-rule/67124 + + :param x: torch.Tensor. variable. + :param coef: torch.Tensor. coefficients of the polynomial. + """ + result = coef[0].clone() + + for c in coef[1:]: + result = (result * x) + c + + return result[0] + + +class ERF1994(nn.Module): + r"""Implementation of ERF1994. + + :param num_coefs: int. The number of polynomial coefficients to use in the approximation. + """ + + def __init__(self, num_coefs: int = 128) -> None: + super().__init__() + + self.n: int = num_coefs + + self.i: torch.Tensor = torch.complex(torch.tensor(0.0), torch.tensor(1.0)) + self.m = 2 * self.n + self.m2 = 2 * self.m + self.k = torch.linspace(-self.m + 1, self.m - 1, self.m2 - 1) + self.l = torch.sqrt(self.n / torch.sqrt(torch.tensor(2.0))) + self.theta = self.k * torch.pi / self.m + self.t = self.l * torch.tan(self.theta / 2.0) + self.f = torch.exp(-self.t ** 2) * (self.l ** 2 + self.t ** 2) # fmt: skip + self.a = torch.fft.fft(torch.fft.fftshift(self.f)).real / self.m2 + self.a = torch.flipud(self.a[1:self.n + 1]) # fmt: skip + + def w_algorithm(self, z: torch.Tensor) -> torch.Tensor: + r"""Compute the Faddeeva function of a complex number. + + :param z: torch.Tensor. A tensor of complex numbers. + """ + self.l = self.l.to(z.device) + self.i = self.i.to(z.device) + self.a = self.a.to(z.device) + + iz = self.i * z + lp_iz, ln_iz = self.l + iz, self.l - iz + + z_ = lp_iz / ln_iz + p = polyval(z_.unsqueeze(0), self.a) + return 2 * p / ln_iz.pow(2) + (1.0 / torch.sqrt(torch.tensor(torch.pi))) / ln_iz + + def forward(self, z: torch.Tensor) -> torch.Tensor: + r"""Compute the error function of a complex number. + + :param z: torch.Tensor. A tensor of complex numbers. + """ + sign_r = torch.sign(z.real) + sign_i = torch.sign(z.imag) + z = torch.complex(torch.abs(z.real), torch.abs(z.imag)) + out = -torch.exp(torch.log(self.w_algorithm(z * self.i)) - z ** 2) + 1 # fmt: skip + return torch.complex(out.real * sign_r, out.imag * sign_i) + + +class TRAC(BaseOptimizer): + r"""A Parameter-Free Optimizer for Lifelong Reinforcement Learning. + + Example: + ------- + Here's an example:: + + model = YourModel() + optimizer = TRAC(AdamW(model.parameters())) + + for input, output in data: + optimizer.zero_grad() + + loss = loss_fn(model(input), output) + loss.backward() + + optimizer.step() + + :param optimizer: Optimizer. base optimizer. + :param betas: List[float]. 
list of beta values. + :param num_coefs: int. the number of polynomial coefficients to use in the approximation. + :param s_prev: float. initial scale value. + :param eps: float. term added to the denominator to improve numerical stability. + """ + + def __init__( + self, + optimizer: OPTIMIZER, + betas: List[float] = (0.9, 0.99, 0.999, 0.9999, 0.99999, 0.999999), + num_coefs: int = 128, + s_prev: float = 1e-8, + eps: float = 1e-8, + ): + self.validate_positive(num_coefs, 'num_coefs') + self.validate_non_negative(s_prev, 's_prev') + self.validate_non_negative(eps, 'eps') + + self._optimizer_step_pre_hooks: Dict[int, Callable] = {} + self._optimizer_step_post_hooks: Dict[int, Callable] = {} + + self.erf = ERF1994(num_coefs=num_coefs) + self.betas = betas + self.s_prev = s_prev + self.eps = eps + + self.f_term = self.s_prev / self.erf_imag(1.0 / torch.sqrt(torch.tensor(2.0))) + + self.optimizer = optimizer + self.defaults: DEFAULTS = optimizer.defaults + + def __str__(self) -> str: + return 'TRAC' + + @property + def param_groups(self): + return self.optimizer.param_groups + + @property + def state(self): + return self.optimizer.state + + @torch.no_grad() + def reset(self): + device = self.param_groups[0]['params'][0].device + + self.state['trac'] = { + 'betas': torch.tensor(self.betas, device=device), + 's': torch.zeros(len(self.betas), device=device), + 'variance': torch.zeros(len(self.betas), device=device), + 'sigma': torch.full((len(self.betas),), 1e-8, device=device), + 'step': 0, + } + + for group in self.param_groups: + for p in group['params']: + self.state['trac'][p] = p.clone() + + @torch.no_grad() + def zero_grad(self) -> None: + self.optimizer.zero_grad(set_to_none=True) + + @torch.no_grad() + def erf_imag(self, x: torch.Tensor) -> torch.Tensor: + if not torch.is_floating_point(x): + x = x.to(torch.float32) + + ix = torch.complex(torch.zeros_like(x), x) + + return self.erf(ix).imag + + @torch.no_grad() + def backup_params_and_grads(self) -> Tuple[Dict, Dict]: + updates, grads = {}, {} + + for group in self.param_groups: + for p in group['params']: + updates[p] = p.clone() + grads[p] = p.grad.clone() if p.grad is not None else None + + return updates, grads + + @torch.no_grad() + def trac_step(self, updates: Dict, grads: Dict) -> None: + self.state['trac']['step'] += 1 + + deltas = {} + + device = self.param_groups[0]['params'][0].device + + h = torch.zeros((1,), device=device) + for group in self.param_groups: + for p in group['params']: + if grads[p] is None: + continue + + theta_ref = self.state['trac'][p] + update = updates[p] + + deltas[p] = (update - theta_ref) / torch.sum(self.state['trac']['s']).add_(self.eps) + update.neg_().add_(p) + + grad, delta = grads[p], deltas[p] + + product = torch.dot(delta.flatten(), grad.flatten()) + h.add_(product) + + delta.add_(update) + + s = self.state['trac']['s'] + betas = self.state['trac']['betas'] + variance = self.state['trac']['variance'] + sigma = self.state['trac']['sigma'] + + variance.mul_(betas.pow(2)).add_(h.pow(2)) + sigma.mul_(betas).sub_(h) + + s_term = self.erf_imag(sigma / (2.0 * variance).sqrt_().add_(self.eps)) + s_term.mul_(self.f_term) + s.copy_(s_term) + + scale = max(torch.sum(s), 0.0) + + for group in self.param_groups: + for p in group['params']: + if grads[p] is None: + continue + + delta = deltas[p] + delta.mul_(scale).add_(self.state['trac'][p]) + + p.copy_(delta) + + @torch.no_grad() + def step(self, closure: CLOSURE = None) -> LOSS: + # TODO: backup is first to get the delta of param and grad, but it does 
not work. + with torch.enable_grad(): + loss = self.optimizer.step(closure) + + updates, grads = self.backup_params_and_grads() + + if 'trac' not in self.state: + device = self.param_groups[0]['params'][0].device + + self.state['trac'] = { + 'betas': torch.tensor(self.betas, device=device), + 's': torch.zeros(len(self.betas), device=device), + 'variance': torch.zeros(len(self.betas), device=device), + 'sigma': torch.full((len(self.betas),), 1e-8, device=device), + 'step': 0, + } + + for group in self.param_groups: + for p in group['params']: + self.state['trac'][p] = updates[p].clone() + + self.trac_step(updates, grads) + + return loss diff --git a/tests/constants.py b/tests/constants.py index 498fbf46b..072645cee 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -85,6 +85,7 @@ 'wsam', 'pcgrad', 'lookahead', + 'trac', ] SPARSE_OPTIMIZERS: List[str] = ['madgrad', 'dadaptadagrad', 'sm3'] diff --git a/tests/test_create_optimizer.py b/tests/test_create_optimizer.py index 56d9c5914..93a3a0e16 100644 --- a/tests/test_create_optimizer.py +++ b/tests/test_create_optimizer.py @@ -9,6 +9,7 @@ def test_create_optimizer(): create_optimizer(model, 'adamp', lr=1e-2, weight_decay=1e-3, use_gc=True, use_lookahead=True) create_optimizer(model, 'alig', lr=1e-2, use_lookahead=True) + create_optimizer(model, 'adalomo', lr=1e-2, use_lookahead=False) def test_bnb_optimizer(): diff --git a/tests/test_general_optimizer_parameters.py b/tests/test_general_optimizer_parameters.py index 800b6e839..752615137 100644 --- a/tests/test_general_optimizer_parameters.py +++ b/tests/test_general_optimizer_parameters.py @@ -8,7 +8,7 @@ @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES) def test_learning_rate(optimizer_name): - if optimizer_name in ('alig', 'a2grad'): + if optimizer_name in {'alig', 'a2grad', 'adamw'}: pytest.skip(f'skip {optimizer_name} optimizer') optimizer = load_optimizer(optimizer_name) diff --git a/tests/test_gradients.py b/tests/test_gradients.py index 76e5a301f..14dd8be68 100644 --- a/tests/test_gradients.py +++ b/tests/test_gradients.py @@ -1,13 +1,13 @@ import pytest import torch -from pytorch_optimizer import SAM, WSAM, AdamP, Lookahead, load_optimizer +from pytorch_optimizer import SAM, TRAC, WSAM, AdamP, Lookahead, load_optimizer from pytorch_optimizer.base.exception import NoSparseGradientError from tests.constants import NO_SPARSE_OPTIMIZERS, SPARSE_OPTIMIZERS, VALID_OPTIMIZER_NAMES from tests.utils import build_environment, simple_parameter, simple_sparse_parameter, sphere_loss -@pytest.mark.parametrize('optimizer_name', [*VALID_OPTIMIZER_NAMES, 'lookahead']) +@pytest.mark.parametrize('optimizer_name', [*VALID_OPTIMIZER_NAMES, 'lookahead', 'trac']) def test_no_gradients(optimizer_name): if optimizer_name in {'lomo', 'adalomo', 'adammini'}: pytest.skip(f'skip {optimizer_name} optimizer.') @@ -25,7 +25,9 @@ def test_no_gradients(optimizer_name): elif optimizer_name in ('lamb', 'ralamb'): optimizer = load_optimizer(optimizer_name)(params, pre_norm=True) elif optimizer_name == 'lookahead': - optimizer = Lookahead(load_optimizer('adamp')(params), k=1) + optimizer = Lookahead(load_optimizer('adamw')(params), k=1) + elif optimizer_name == 'trac': + optimizer = TRAC(load_optimizer('adamw')(params)) else: optimizer = load_optimizer(optimizer_name)(params) @@ -33,13 +35,13 @@ def test_no_gradients(optimizer_name): sphere_loss(p1 + p3).backward(create_graph=True) optimizer.step(lambda: 0.1) # for AliG optimizer - if optimizer_name != 'lookahead': + if optimizer_name not in 
{'lookahead', 'trac'}: optimizer.zero_grad(set_to_none=True) @pytest.mark.parametrize('no_sparse_optimizer', NO_SPARSE_OPTIMIZERS) def test_sparse_not_supported(no_sparse_optimizer): - if no_sparse_optimizer in {'lomo', 'adalomo', 'bsam', 'adammini'}: + if no_sparse_optimizer in {'lomo', 'adalomo', 'bsam', 'adammini', 'adamw'}: pytest.skip(f'skip {no_sparse_optimizer} optimizer.') param = simple_sparse_parameter()[1] diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index 8b5fc4d0d..9a23043ca 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 72 + assert len(get_supported_optimizers()) == 73 def test_get_supported_lr_schedulers(): diff --git a/tests/test_optimizer_parameters.py b/tests/test_optimizer_parameters.py index 433b3d988..c83e11cbe 100644 --- a/tests/test_optimizer_parameters.py +++ b/tests/test_optimizer_parameters.py @@ -52,7 +52,7 @@ def test_came_parameters(): def test_pcgrad_parameters(): - opt = load_optimizer('adamp')([simple_parameter()]) + opt = load_optimizer('adamw')([simple_parameter()]) # test reduction for reduction in ['mean', 'sum']: @@ -64,16 +64,16 @@ def test_pcgrad_parameters(): def test_sam_parameters(): with pytest.raises(ValueError): - SAM(None, load_optimizer('adamp'), rho=-0.1) + SAM(None, load_optimizer('adamw'), rho=-0.1) def test_wsam_parameters(): with pytest.raises(ValueError): - WSAM(None, None, load_optimizer('adamp'), rho=-0.1) + WSAM(None, None, load_optimizer('adamw'), rho=-0.1) def test_lookahead_parameters(): - optimizer = load_optimizer('adamp')([simple_parameter()]) + optimizer = load_optimizer('adamw')([simple_parameter()]) for pullback_momentum in PULLBACK_MOMENTUM: opt = Lookahead(optimizer, pullback_momentum=pullback_momentum) diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index 6501de3ee..aaf9024e1 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -7,6 +7,7 @@ BSAM, GSAM, SAM, + TRAC, WSAM, CosineScheduler, DynamicLossScaler, @@ -667,3 +668,37 @@ def test_adam_mini_optimizer(environment): optimizer = load_optimizer('AdamMini')(model) optimizer.reset() optimizer.step() + + +def test_trac_optimizer(environment): + (x_data, y_data), model, loss_fn = environment + + optimizer = TRAC(load_optimizer('adamw')(model.parameters(), lr=1e0)) + + init_loss, loss = np.inf, np.inf + for _ in range(3): + loss = loss_fn(model(x_data), y_data) + + if init_loss == np.inf: + init_loss = loss + + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + assert tensor_to_numpy(init_loss) > 2.0 * tensor_to_numpy(loss) + + +def test_trac_optimizer_erf_imag(): + model = Example() + + optimizer = TRAC(load_optimizer('adamw')(model.parameters())) + + optimizer.reset() + optimizer.zero_grad() + + complex_tensor = torch.complex(torch.tensor(0.0), torch.tensor(1.0)) + optimizer.erf_imag(complex_tensor) + + assert str(optimizer).lower() == 'trac'
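
A minimal, self-contained usage sketch of the new `TRAC` wrapper, following the docstring example and `test_trac_optimizer` above (the toy model, data, and hyperparameters are illustrative, not part of this change):

```python
import torch
from torch import nn
from torch.optim import AdamW

from pytorch_optimizer import TRAC

# toy model and synthetic regression data, for illustration only
model = nn.Linear(4, 1)
x, y = torch.randn(64, 4), torch.randn(64, 1)
loss_fn = nn.MSELoss()

# TRAC wraps a base optimizer; `zero_grad()` and `step()` delegate to it,
# and the resulting parameter update is then rescaled by the erf-based TRAC rule
optimizer = TRAC(AdamW(model.parameters(), lr=1e-3))

for _ in range(10):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimizer.step()
```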
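
The `create_optimizer` branch added above passes the model itself, rather than its parameters, to `lomo`, `adalomo`, and `adammini`; a minimal sketch of the resulting call, mirroring `test_create_optimizer` (the toy model is illustrative):

```python
from torch import nn

from pytorch_optimizer import create_optimizer

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))

# 'adalomo' goes through the new branch, so the optimizer is constructed from the model itself
optimizer = create_optimizer(model, 'adalomo', lr=1e-2)
```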