Merge pull request #301 from kozistr/feature/demo-optimizer
[Feature] Implement DeMo optimizer
kozistr authored Dec 2, 2024
2 parents 0768ab2 + 202fd2f commit c341872
Showing 10 changed files with 527 additions and 36 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -10,7 +10,7 @@

**pytorch-optimizer** is a collection of optimizers and LR schedulers for PyTorch.
I re-implemented the algorithms from their original papers, with speed & memory tweaks and plug-ins. It also includes useful and practical optimization ideas.
Currently, **79 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
Currently, **80 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!

Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer).

@@ -184,6 +184,8 @@ get_supported_optimizers(['adam*', 'ranger*'])
| ADOPT | *Modified Adam Can Converge with Any β2 with the Optimal Rate* | [github](https://github.com/iShohei220/adopt) | <https://arxiv.org/abs/2411.02853> | [cite](https://github.com/iShohei220/adopt?tab=readme-ov-file#citation) |
| FTRL | *Follow The Regularized Leader* | | <https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf> | |
| Cautious | *Improving Training with One Line of Code* | [github](https://github.com/kyleliang919/C-Optim) | <https://arxiv.org/pdf/2411.16085v1> | [cite](https://github.com/kyleliang919/C-Optim?tab=readme-ov-file#citation) |
| DeMo | *Decoupled Momentum Optimization* | [github](https://github.com/bloc97/DeMo) | <https://arxiv.org/abs/2411.19870> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241119870P/exportcitation) |
| MicroAdam | *Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence* | [github](https://github.com/IST-DASLab/MicroAdam) | <https://arxiv.org/abs/2405.15593> | [cite](https://github.com/IST-DASLab/MicroAdam?tab=readme-ov-file#citing) |
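
For context on the updated count and the new `DeMo` row above, the library's `get_supported_optimizers` helper (visible in the hunk header) lists the registry and accepts wildcard filters. A minimal sketch, assuming the new class is registered under the lowercase name `'demo'`:

```python
# Minimal sketch: inspecting the optimizer registry after this commit.
# Assumption: DeMo is registered under the lowercase key 'demo'.
from pytorch_optimizer import get_supported_optimizers

print(len(get_supported_optimizers()))               # roughly 80, per the README count above
print(get_supported_optimizers(['demo', 'adam*']))   # wildcard filtering, as in the diff context
```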

## Supported LR Scheduler

6 changes: 6 additions & 0 deletions docs/changelogs/v3.3.1.md
@@ -0,0 +1,6 @@
### Change Log

### Feature

* Implement `DeMo` optimizer. (#300, #301)
* [Decoupled Momentum Optimization](https://arxiv.org/abs/2411.19870)
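
As a rough illustration of the new feature, here is a minimal single-process training sketch. The DeMo-specific knob names mentioned in the comments (`compression_decay`, `compression_topk`, `compression_chunk`) follow the upstream DeMo reference implementation and are assumptions here, not confirmed by this diff; `DeMo` targets distributed data-parallel training, so `torch.distributed` is initialized even for one process.

```python
# Minimal sketch, not the canonical recipe: library defaults are used for the
# DeMo-specific knobs (compression_decay / compression_topk / compression_chunk
# in the reference implementation -- assumed names).
import torch
import torch.distributed as dist
from pytorch_optimizer import DeMo

# DeMo shares compressed momentum components across data-parallel workers,
# so a single-member process group is created for this standalone sketch.
dist.init_process_group('gloo', init_method='tcp://127.0.0.1:29500', rank=0, world_size=1)

model = torch.nn.Linear(64, 64)
optimizer = DeMo(model.parameters(), lr=1e-3)

for _ in range(3):
    loss = model(torch.randn(8, 64)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

dist.destroy_process_group()
```

In a real run, one process per accelerator would join the same group; per the linked paper, DeMo exchanges only the fast-moving momentum components rather than full gradients, which is what reduces inter-accelerator communication.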
4 changes: 3 additions & 1 deletion docs/index.md
@@ -10,7 +10,7 @@

**pytorch-optimizer** is a collection of optimizers and LR schedulers for PyTorch.
I re-implemented the algorithms from their original papers, with speed & memory tweaks and plug-ins. It also includes useful and practical optimization ideas.
Currently, **79 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
Currently, **80 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!

Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer).

@@ -184,6 +184,8 @@ get_supported_optimizers(['adam*', 'ranger*'])
| ADOPT | *Modified Adam Can Converge with Any β2 with the Optimal Rate* | [github](https://github.com/iShohei220/adopt) | <https://arxiv.org/abs/2411.02853> | [cite](https://github.com/iShohei220/adopt?tab=readme-ov-file#citation) |
| FTRL | *Follow The Regularized Leader* | | <https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf> | |
| Cautious | *Improving Training with One Line of Code* | [github](https://github.com/kyleliang919/C-Optim) | <https://arxiv.org/pdf/2411.16085v1> | [cite](https://github.com/kyleliang919/C-Optim?tab=readme-ov-file#citation) |
| DeMo | *Decoupled Momentum Optimization* | [github](https://github.com/bloc97/DeMo) | <https://arxiv.org/abs/2411.19870> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241119870P/exportcitation) |
| MicroAdam | *Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence* | [github](https://github.com/IST-DASLab/MicroAdam) | <https://arxiv.org/abs/2405.15593> | [cite](https://github.com/IST-DASLab/MicroAdam?tab=readme-ov-file#citing) |

## Supported LR Scheduler

4 changes: 4 additions & 0 deletions docs/optimizer.md
@@ -152,6 +152,10 @@
:docstring:
:members:

::: pytorch_optimizer.DeMo
:docstring:
:members:

::: pytorch_optimizer.DiffGrad
:docstring:
:members:
80 changes: 55 additions & 25 deletions poetry.lock

Some generated files are not rendered by default.

13 changes: 7 additions & 6 deletions pyproject.toml
@@ -14,12 +14,12 @@ keywords = [
"AdaDelta", "AdaFactor", "AdaMax", "AdamG", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdEMAMix", "ADOPT",
"AdaHessian", "Adai", "Adalite", "AdaLomo", "AdamMini", "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos",
"Apollo", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion",
"DiffGrad", "FAdam", "Fromage", "FTRL", "GaLore", "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion",
"LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam",
"QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo",
"ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC",
"WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered",
"Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
"DeMo", "DiffGrad", "FAdam", "Fromage", "FTRL", "GaLore", "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS",
"Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy",
"QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP",
"Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger",
"TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard",
"Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
]
classifiers = [
"License :: OSI Approved :: Apache Software License",
@@ -130,6 +130,7 @@ testpaths = "tests"
omit = [
"./pytorch_optimizer/optimizer/rotograd.py",
"./pytorch_optimizer/optimizer/adam_mini.py",
"./pytorch_optimizer/optimizer/demo.py",
]

[build-system]
1 change: 1 addition & 0 deletions pytorch_optimizer/__init__.py
@@ -97,6 +97,7 @@
DAdaptAdan,
DAdaptLion,
DAdaptSGD,
DeMo,
DiffGrad,
DynamicLossScaler,
FAdam,
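
With the export added above, the class is importable from the package root. A quick sketch (the string-based `load_optimizer` lookup and its `'demo'` key are assumptions, not confirmed by this diff):

```python
# Sketch: the direct import enabled by the __init__.py change above,
# plus the string-based loader (registry key assumed to be the lowercase class name).
from pytorch_optimizer import DeMo, load_optimizer

print(DeMo)                    # direct class import
print(load_optimizer('demo'))  # expected to resolve to the same class
```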
The remaining 3 changed files are not rendered here.
