diff --git a/README.md b/README.md index 89095bbe2..c06903f99 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **74 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -94,83 +94,84 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | 
*Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | 
[github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | 
[github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | 
[github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for 
Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | 
[github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale 
vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural 
Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | +| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | +| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| TRAC | *Adaptive Parameter-free Optimization* | [github](https://github.com/ComputationalRobotics/TRAC) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240516642M/exportcitation) | ## Supported LR Scheduler @@ -336,7 +337,7 @@ If you use this software, please cite it below. 
Or you can get it from "cite thi month = jan, title = {{pytorch_optimizer: optimizer & lr scheduler & loss function collections in PyTorch}}, url = {https://github.com/kozistr/pytorch_optimizer}, - version = {3.0.1}, + version = {3.1.0}, year = {2021} } diff --git a/docs/changelogs/v3.1.1.md b/docs/changelogs/v3.1.1.md new file mode 100644 index 000000000..07a3115df --- /dev/null +++ b/docs/changelogs/v3.1.1.md @@ -0,0 +1,11 @@ +## Change Log + +### Feature + +* Implement `TRAC` optimizer. (#263) + * [Fast TRAC: A Parameter-Free Optimizer for Lifelong Reinforcement Learning](https://arxiv.org/abs/2405.16642) +* Support `AdamW` optimizer via `create_optimizer()`. (#263) + +### Bug + +* Fix to handle the optimizers that only take the `model` instead of the parameters in `create_optimizer()`. (#263) diff --git a/docs/index.md b/docs/index.md index 89095bbe2..c06903f99 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **74 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -94,83 +94,84 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On 
the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | 
[github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | 
+|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance 
between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive 
optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| 
Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | 
[cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | 
[cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | +| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | +| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| TRAC | *Adaptive Parameter-free Optimization* | [github](https://github.com/ComputationalRobotics/TRAC) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240516642M/exportcitation) | ## Supported LR Scheduler @@ -336,7 +337,7 @@ If you use this software, please cite it below. Or you can get it from "cite thi month = jan, title = {{pytorch_optimizer: optimizer & lr scheduler & loss function collections in PyTorch}}, url = {https://github.com/kozistr/pytorch_optimizer}, - version = {3.0.1}, + version = {3.1.0}, year = {2021} } diff --git a/docs/optimizer.md b/docs/optimizer.md index bd4f4e6ac..0fb3628fc 100644 --- a/docs/optimizer.md +++ b/docs/optimizer.md @@ -328,6 +328,10 @@ :docstring: :members: +::: pytorch_optimizer.TRAC + :docstring: + :members: + ::: pytorch_optimizer.WSAM :docstring: :members: diff --git a/pyproject.toml b/pyproject.toml index 0242aaf7c..cdef27c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ keywords = [ "GaLore", "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", - "SM3", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", + "SM3", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore", ] diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index 3628bb74e..25241f9f4 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -4,6 +4,7 @@ import torch.cuda from torch import nn +from torch.optim import AdamW from pytorch_optimizer.base.types import OPTIMIZER, PARAMETERS, SCHEDULER from pytorch_optimizer.loss.bi_tempered import BinaryBiTemperedLogisticLoss, BiTemperedLogisticLoss @@ -115,6 +116,7 @@ from pytorch_optimizer.optimizer.srmm import SRMM from pytorch_optimizer.optimizer.swats import SWATS from pytorch_optimizer.optimizer.tiger import Tiger +from pytorch_optimizer.optimizer.trac import TRAC from pytorch_optimizer.optimizer.utils import ( clip_grad_norm, disable_running_stats, @@ -131,6 +133,7 @@ HAS_Q_GALORE: bool = find_spec('q-galore-torch') is not None OPTIMIZER_LIST: List[OPTIMIZER] = [ + AdamW, AdaBelief, AdaBound, PID, @@ -350,6 +353,8 @@ def create_optimizer( if optimizer_name == 'alig': optimizer = optimizer(parameters, max_lr=lr, **kwargs) + elif optimizer_name in {'lomo', 'adalomo', 'adammini'}: + optimizer = optimizer(model, lr=lr, **kwargs) else: optimizer = optimizer(parameters, 
lr=lr, **kwargs) diff --git a/pytorch_optimizer/optimizer/lookahead.py b/pytorch_optimizer/optimizer/lookahead.py index 44a0be2ed..2291d50a2 100644 --- a/pytorch_optimizer/optimizer/lookahead.py +++ b/pytorch_optimizer/optimizer/lookahead.py @@ -22,7 +22,7 @@ def __init__( k: int = 5, alpha: float = 0.5, pullback_momentum: str = 'none', - ): + ) -> None: self.validate_positive(k, 'k') self.validate_range(alpha, 'alpha', 0.0, 1.0) self.validate_options(pullback_momentum, 'pullback_momentum', ['none', 'reset', 'pullback']) diff --git a/pytorch_optimizer/optimizer/trac.py b/pytorch_optimizer/optimizer/trac.py new file mode 100644 index 000000000..7dbc98e5a --- /dev/null +++ b/pytorch_optimizer/optimizer/trac.py @@ -0,0 +1,253 @@ +from typing import Callable, Dict, List, Tuple + +import torch +from torch import nn + +from pytorch_optimizer.base.optimizer import BaseOptimizer +from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, OPTIMIZER + + +def polyval(x: torch.Tensor, coef: torch.Tensor) -> torch.Tensor: + r"""Implement of the Horner scheme to evaluate a polynomial. + + taken from https://discuss.pytorch.org/t/polynomial-evaluation-by-horner-rule/67124 + + :param x: torch.Tensor. variable. + :param coef: torch.Tensor. coefficients of the polynomial. + """ + result = coef[0].clone() + + for c in coef[1:]: + result = (result * x) + c + + return result[0] + + +class ERF1994(nn.Module): + r"""Implementation of ERF1994. + + :param num_coefs: int. The number of polynomial coefficients to use in the approximation. + """ + + def __init__(self, num_coefs: int = 128) -> None: + super().__init__() + + self.n: int = num_coefs + + self.i: torch.Tensor = torch.complex(torch.tensor(0.0), torch.tensor(1.0)) + self.m = 2 * self.n + self.m2 = 2 * self.m + self.k = torch.linspace(-self.m + 1, self.m - 1, self.m2 - 1) + self.l = torch.sqrt(self.n / torch.sqrt(torch.tensor(2.0))) + self.theta = self.k * torch.pi / self.m + self.t = self.l * torch.tan(self.theta / 2.0) + self.f = torch.exp(-self.t ** 2) * (self.l ** 2 + self.t ** 2) # fmt: skip + self.a = torch.fft.fft(torch.fft.fftshift(self.f)).real / self.m2 + self.a = torch.flipud(self.a[1:self.n + 1]) # fmt: skip + + def w_algorithm(self, z: torch.Tensor) -> torch.Tensor: + r"""Compute the Faddeeva function of a complex number. + + :param z: torch.Tensor. A tensor of complex numbers. + """ + self.l = self.l.to(z.device) + self.i = self.i.to(z.device) + self.a = self.a.to(z.device) + + iz = self.i * z + lp_iz, ln_iz = self.l + iz, self.l - iz + + z_ = lp_iz / ln_iz + p = polyval(z_.unsqueeze(0), self.a) + return 2 * p / ln_iz.pow(2) + (1.0 / torch.sqrt(torch.tensor(torch.pi))) / ln_iz + + def forward(self, z: torch.Tensor) -> torch.Tensor: + r"""Compute the error function of a complex number. + + :param z: torch.Tensor. A tensor of complex numbers. + """ + sign_r = torch.sign(z.real) + sign_i = torch.sign(z.imag) + z = torch.complex(torch.abs(z.real), torch.abs(z.imag)) + out = -torch.exp(torch.log(self.w_algorithm(z * self.i)) - z ** 2) + 1 # fmt: skip + return torch.complex(out.real * sign_r, out.imag * sign_i) + + +class TRAC(BaseOptimizer): + r"""A Parameter-Free Optimizer for Lifelong Reinforcement Learning. + + Example: + ------- + Here's an example:: + + model = YourModel() + optimizer = TRAC(AdamW(model.parameters())) + + for input, output in data: + optimizer.zero_grad() + + loss = loss_fn(model(input), output) + loss.backward() + + optimizer.step() + + :param optimizer: Optimizer. base optimizer. + :param betas: List[float]. 
list of beta values. + :param num_coefs: int. the number of polynomial coefficients to use in the approximation. + :param s_prev: float. initial scale value. + :param eps: float. term added to the denominator to improve numerical stability. + """ + + def __init__( + self, + optimizer: OPTIMIZER, + betas: List[float] = (0.9, 0.99, 0.999, 0.9999, 0.99999, 0.999999), + num_coefs: int = 128, + s_prev: float = 1e-8, + eps: float = 1e-8, + ): + self.validate_positive(num_coefs, 'num_coefs') + self.validate_non_negative(s_prev, 's_prev') + self.validate_non_negative(eps, 'eps') + + self._optimizer_step_pre_hooks: Dict[int, Callable] = {} + self._optimizer_step_post_hooks: Dict[int, Callable] = {} + + self.erf = ERF1994(num_coefs=num_coefs) + self.betas = betas + self.s_prev = s_prev + self.eps = eps + + self.f_term = self.s_prev / self.erf_imag(1.0 / torch.sqrt(torch.tensor(2.0))) + + self.optimizer = optimizer + self.defaults: DEFAULTS = optimizer.defaults + + def __str__(self) -> str: + return 'TRAC' + + @property + def param_groups(self): + return self.optimizer.param_groups + + @property + def state(self): + return self.optimizer.state + + @torch.no_grad() + def reset(self): + device = self.param_groups[0]['params'][0].device + + self.state['trac'] = { + 'betas': torch.tensor(self.betas, device=device), + 's': torch.zeros(len(self.betas), device=device), + 'variance': torch.zeros(len(self.betas), device=device), + 'sigma': torch.full((len(self.betas),), 1e-8, device=device), + 'step': 0, + } + + for group in self.param_groups: + for p in group['params']: + self.state['trac'][p] = p.clone() + + @torch.no_grad() + def zero_grad(self) -> None: + self.optimizer.zero_grad(set_to_none=True) + + @torch.no_grad() + def erf_imag(self, x: torch.Tensor) -> torch.Tensor: + if not torch.is_floating_point(x): + x = x.to(torch.float32) + + ix = torch.complex(torch.zeros_like(x), x) + + return self.erf(ix).imag + + @torch.no_grad() + def backup_params_and_grads(self) -> Tuple[Dict, Dict]: + updates, grads = {}, {} + + for group in self.param_groups: + for p in group['params']: + updates[p] = p.clone() + grads[p] = p.grad.clone() if p.grad is not None else None + + return updates, grads + + @torch.no_grad() + def trac_step(self, updates: Dict, grads: Dict) -> None: + self.state['trac']['step'] += 1 + + deltas = {} + + device = self.param_groups[0]['params'][0].device + + h = torch.zeros((1,), device=device) + for group in self.param_groups: + for p in group['params']: + if grads[p] is None: + continue + + theta_ref = self.state['trac'][p] + update = updates[p] + + deltas[p] = (update - theta_ref) / torch.sum(self.state['trac']['s']).add_(self.eps) + update.neg_().add_(p) + + grad, delta = grads[p], deltas[p] + + product = torch.dot(delta.flatten(), grad.flatten()) + h.add_(product) + + delta.add_(update) + + s = self.state['trac']['s'] + betas = self.state['trac']['betas'] + variance = self.state['trac']['variance'] + sigma = self.state['trac']['sigma'] + + variance.mul_(betas.pow(2)).add_(h.pow(2)) + sigma.mul_(betas).sub_(h) + + s_term = self.erf_imag(sigma / (2.0 * variance).sqrt_().add_(self.eps)) + s_term.mul_(self.f_term) + s.copy_(s_term) + + scale = max(torch.sum(s), 0.0) + + for group in self.param_groups: + for p in group['params']: + if grads[p] is None: + continue + + delta = deltas[p] + delta.mul_(scale).add_(self.state['trac'][p]) + + p.copy_(delta) + + @torch.no_grad() + def step(self, closure: CLOSURE = None) -> LOSS: + # TODO: backup is first to get the delta of param and grad, but it does 
not work. + with torch.enable_grad(): + loss = self.optimizer.step(closure) + + updates, grads = self.backup_params_and_grads() + + if 'trac' not in self.state: + device = self.param_groups[0]['params'][0].device + + self.state['trac'] = { + 'betas': torch.tensor(self.betas, device=device), + 's': torch.zeros(len(self.betas), device=device), + 'variance': torch.zeros(len(self.betas), device=device), + 'sigma': torch.full((len(self.betas),), 1e-8, device=device), + 'step': 0, + } + + for group in self.param_groups: + for p in group['params']: + self.state['trac'][p] = updates[p].clone() + + self.trac_step(updates, grads) + + return loss diff --git a/tests/constants.py b/tests/constants.py index 498fbf46b..072645cee 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -85,6 +85,7 @@ 'wsam', 'pcgrad', 'lookahead', + 'trac', ] SPARSE_OPTIMIZERS: List[str] = ['madgrad', 'dadaptadagrad', 'sm3'] diff --git a/tests/test_create_optimizer.py b/tests/test_create_optimizer.py index 56d9c5914..93a3a0e16 100644 --- a/tests/test_create_optimizer.py +++ b/tests/test_create_optimizer.py @@ -9,6 +9,7 @@ def test_create_optimizer(): create_optimizer(model, 'adamp', lr=1e-2, weight_decay=1e-3, use_gc=True, use_lookahead=True) create_optimizer(model, 'alig', lr=1e-2, use_lookahead=True) + create_optimizer(model, 'adalomo', lr=1e-2, use_lookahead=False) def test_bnb_optimizer(): diff --git a/tests/test_general_optimizer_parameters.py b/tests/test_general_optimizer_parameters.py index 800b6e839..752615137 100644 --- a/tests/test_general_optimizer_parameters.py +++ b/tests/test_general_optimizer_parameters.py @@ -8,7 +8,7 @@ @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES) def test_learning_rate(optimizer_name): - if optimizer_name in ('alig', 'a2grad'): + if optimizer_name in {'alig', 'a2grad', 'adamw'}: pytest.skip(f'skip {optimizer_name} optimizer') optimizer = load_optimizer(optimizer_name) diff --git a/tests/test_gradients.py b/tests/test_gradients.py index 76e5a301f..14dd8be68 100644 --- a/tests/test_gradients.py +++ b/tests/test_gradients.py @@ -1,13 +1,13 @@ import pytest import torch -from pytorch_optimizer import SAM, WSAM, AdamP, Lookahead, load_optimizer +from pytorch_optimizer import SAM, TRAC, WSAM, AdamP, Lookahead, load_optimizer from pytorch_optimizer.base.exception import NoSparseGradientError from tests.constants import NO_SPARSE_OPTIMIZERS, SPARSE_OPTIMIZERS, VALID_OPTIMIZER_NAMES from tests.utils import build_environment, simple_parameter, simple_sparse_parameter, sphere_loss -@pytest.mark.parametrize('optimizer_name', [*VALID_OPTIMIZER_NAMES, 'lookahead']) +@pytest.mark.parametrize('optimizer_name', [*VALID_OPTIMIZER_NAMES, 'lookahead', 'trac']) def test_no_gradients(optimizer_name): if optimizer_name in {'lomo', 'adalomo', 'adammini'}: pytest.skip(f'skip {optimizer_name} optimizer.') @@ -25,7 +25,9 @@ def test_no_gradients(optimizer_name): elif optimizer_name in ('lamb', 'ralamb'): optimizer = load_optimizer(optimizer_name)(params, pre_norm=True) elif optimizer_name == 'lookahead': - optimizer = Lookahead(load_optimizer('adamp')(params), k=1) + optimizer = Lookahead(load_optimizer('adamw')(params), k=1) + elif optimizer_name == 'trac': + optimizer = TRAC(load_optimizer('adamw')(params)) else: optimizer = load_optimizer(optimizer_name)(params) @@ -33,13 +35,13 @@ def test_no_gradients(optimizer_name): sphere_loss(p1 + p3).backward(create_graph=True) optimizer.step(lambda: 0.1) # for AliG optimizer - if optimizer_name != 'lookahead': + if optimizer_name not in 
{'lookahead', 'trac'}: optimizer.zero_grad(set_to_none=True) @pytest.mark.parametrize('no_sparse_optimizer', NO_SPARSE_OPTIMIZERS) def test_sparse_not_supported(no_sparse_optimizer): - if no_sparse_optimizer in {'lomo', 'adalomo', 'bsam', 'adammini'}: + if no_sparse_optimizer in {'lomo', 'adalomo', 'bsam', 'adammini', 'adamw'}: pytest.skip(f'skip {no_sparse_optimizer} optimizer.') param = simple_sparse_parameter()[1] diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index 8b5fc4d0d..9a23043ca 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 72 + assert len(get_supported_optimizers()) == 73 def test_get_supported_lr_schedulers(): diff --git a/tests/test_optimizer_parameters.py b/tests/test_optimizer_parameters.py index 433b3d988..c83e11cbe 100644 --- a/tests/test_optimizer_parameters.py +++ b/tests/test_optimizer_parameters.py @@ -52,7 +52,7 @@ def test_came_parameters(): def test_pcgrad_parameters(): - opt = load_optimizer('adamp')([simple_parameter()]) + opt = load_optimizer('adamw')([simple_parameter()]) # test reduction for reduction in ['mean', 'sum']: @@ -64,16 +64,16 @@ def test_pcgrad_parameters(): def test_sam_parameters(): with pytest.raises(ValueError): - SAM(None, load_optimizer('adamp'), rho=-0.1) + SAM(None, load_optimizer('adamw'), rho=-0.1) def test_wsam_parameters(): with pytest.raises(ValueError): - WSAM(None, None, load_optimizer('adamp'), rho=-0.1) + WSAM(None, None, load_optimizer('adamw'), rho=-0.1) def test_lookahead_parameters(): - optimizer = load_optimizer('adamp')([simple_parameter()]) + optimizer = load_optimizer('adamw')([simple_parameter()]) for pullback_momentum in PULLBACK_MOMENTUM: opt = Lookahead(optimizer, pullback_momentum=pullback_momentum) diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index 6501de3ee..aaf9024e1 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -7,6 +7,7 @@ BSAM, GSAM, SAM, + TRAC, WSAM, CosineScheduler, DynamicLossScaler, @@ -667,3 +668,37 @@ def test_adam_mini_optimizer(environment): optimizer = load_optimizer('AdamMini')(model) optimizer.reset() optimizer.step() + + +def test_trac_optimizer(environment): + (x_data, y_data), model, loss_fn = environment + + optimizer = TRAC(load_optimizer('adamw')(model.parameters(), lr=1e0)) + + init_loss, loss = np.inf, np.inf + for _ in range(3): + loss = loss_fn(model(x_data), y_data) + + if init_loss == np.inf: + init_loss = loss + + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + assert tensor_to_numpy(init_loss) > 2.0 * tensor_to_numpy(loss) + + +def test_trac_optimizer_erf_imag(): + model = Example() + + optimizer = TRAC(load_optimizer('adamw')(model.parameters())) + + optimizer.reset() + optimizer.zero_grad() + + complex_tensor = torch.complex(torch.tensor(0.0), torch.tensor(1.0)) + optimizer.erf_imag(complex_tensor) + + assert str(optimizer).lower() == 'trac'
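
A minimal, self-contained usage sketch of the new `TRAC` wrapper, following the docstring example and `test_trac_optimizer` above (the toy model, data, and hyperparameters are illustrative, not part of this change):

```python
import torch
from torch import nn
from torch.optim import AdamW

from pytorch_optimizer import TRAC

# toy model and synthetic regression data, for illustration only
model = nn.Linear(4, 1)
x, y = torch.randn(64, 4), torch.randn(64, 1)
loss_fn = nn.MSELoss()

# TRAC wraps a base optimizer; `zero_grad()` and `step()` delegate to it,
# and the resulting parameter update is then rescaled by the erf-based TRAC rule
optimizer = TRAC(AdamW(model.parameters(), lr=1e-3))

for _ in range(10):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimizer.step()
```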
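
The `create_optimizer` branch added above passes the model itself, rather than its parameters, to `lomo`, `adalomo`, and `adammini`; a minimal sketch of the resulting call, mirroring `test_create_optimizer` (the toy model is illustrative):

```python
from torch import nn

from pytorch_optimizer import create_optimizer

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))

# 'adalomo' goes through the new branch, so the optimizer is constructed from the model itself
optimizer = create_optimizer(model, 'adalomo', lr=1e-2)
```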