From 059893eaa95bf2857f27aa294bf59106cb1bd76e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 11 Apr 2022 11:31:18 -0500 Subject: [PATCH] Upgrade ensmallen to 2.19.0 (#49) Co-authored-by: coatless --- ChangeLog | 7 + DESCRIPTION | 2 +- NEWS.md | 17 ++ inst/include/ensmallen.hpp | 9 +- .../ensmallen_bits/ada_belief/ada_belief.hpp | 186 ++++++++++++++++ .../ada_belief/ada_belief_impl.hpp | 44 ++++ .../ada_belief/ada_belief_update.hpp | 153 +++++++++++++ .../ada_bound/ada_bound_update.hpp | 30 +-- .../ada_bound/ams_bound_update.hpp | 30 +-- .../ensmallen_bits/ada_sqrt/ada_sqrt.hpp | 168 ++++++++++++++ .../ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp | 38 ++++ .../ada_sqrt/ada_sqrt_update.hpp | 118 ++++++++++ .../ensmallen_bits/adam/adam_update.hpp | 25 +-- .../ensmallen_bits/adam/adamax_update.hpp | 21 +- .../ensmallen_bits/adam/amsgrad_update.hpp | 25 +-- .../ensmallen_bits/adam/nadam_update.hpp | 28 +-- .../ensmallen_bits/adam/nadamax_update.hpp | 23 +- .../adam/optimisticadam_update.hpp | 28 +-- .../ensmallen_bits/demon_adam/demon_adam.hpp | 207 ++++++++++++++++++ .../demon_adam/demon_adam_update.hpp | 169 ++++++++++++++ .../ensmallen_bits/demon_sgd/demon_sgd.hpp | 178 +++++++++++++++ .../demon_sgd/demon_sgd_update.hpp | 139 ++++++++++++ inst/include/ensmallen_bits/ens_version.hpp | 10 +- .../ensmallen_bits/ftml/ftml_update.hpp | 22 +- .../fw/proximal/proximal_impl.hpp | 2 +- .../ensmallen_bits/lbfgs/lbfgs_impl.hpp | 4 +- .../ensmallen_bits/padam/padam_update.hpp | 25 +-- .../ensmallen_bits/qhadam/qhadam_update.hpp | 25 +-- inst/include/ensmallen_bits/sa/sa_impl.hpp | 2 +- .../spalera_sgd/spalera_sgd_impl.hpp | 1 - .../ensmallen_bits/swats/swats_update.hpp | 26 +-- inst/include/ensmallen_bits/yogi/yogi.hpp | 189 ++++++++++++++++ .../include/ensmallen_bits/yogi/yogi_impl.hpp | 44 ++++ .../ensmallen_bits/yogi/yogi_update.hpp | 146 ++++++++++++ tools/HISTORYold.md | 23 ++ 35 files changed, 1951 insertions(+), 213 deletions(-) create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief.hpp create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp create mode 100644 inst/include/ensmallen_bits/demon_adam/demon_adam.hpp create mode 100644 inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp create mode 100644 inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp create mode 100644 inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi_impl.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi_update.hpp diff --git a/ChangeLog b/ChangeLog index 4033cfd..5b2142b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2022-04-11 James Balamuta + + * DESCRIPTION (Version): Release 2.19.0 + * NEWS.md: Update for Ensmallen release 2.19.0 + * inst/include/ensmallen_bits: Upgraded to Ensmallen 2.19.0 + * inst/include/ensmallen.hpp: ditto + 2022-02-18 James Balamuta * DESCRIPTION: Update URLs diff --git a/DESCRIPTION b/DESCRIPTION index ced4419..a23034a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: RcppEnsmallen Title: Header-Only C++ Mathematical Optimization 
Library for 'Armadillo' -Version: 0.2.18.2.1 +Version: 0.2.19.0.1 Authors@R: c( person("James Joseph", "Balamuta", email = "balamut2@illinois.edu", role = c("aut", "cre", "cph"), diff --git a/NEWS.md b/NEWS.md index 3637526..f2e8938 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# RcppEnsmallen 0.2.19.0.1 + +- Upgraded to ensmallen 2.19.0: "Eight Ball Deluxe" (2022-04-11) + - Added DemonSGD and DemonAdam optimizers + ([#211](https://github.com/mlpack/ensmallen/pull/211)). + - Fix bug with Adam-like optimizers not resetting when `resetPolicy` is `true`. + ([#340](https://github.com/mlpack/ensmallen/pull/340)). + - Add Yogi optimizer + ([#232](https://github.com/mlpack/ensmallen/pull/232)). + - Add AdaBelief optimizer + ([#233](https://github.com/mlpack/ensmallen/pull/233)). + - Add AdaSqrt optimizer + ([#234](https://github.com/mlpack/ensmallen/pull/234)). + + - Bump check for minimum supported version of Armadillo + ([#342](https://github.com/mlpack/ensmallen/pull/342)). + # RcppEnsmallen 0.2.18.2.1 - Upgraded to ensmallen 2.18.2: "Fairmount Bagel" (2022-02-14) diff --git a/inst/include/ensmallen.hpp b/inst/include/ensmallen.hpp index 010c5cb..a1338e8 100644 --- a/inst/include/ensmallen.hpp +++ b/inst/include/ensmallen.hpp @@ -29,8 +29,8 @@ #error "please enable C++11/C++14 mode in your compiler" #endif -#if ((ARMA_VERSION_MAJOR < 8) || ((ARMA_VERSION_MAJOR == 8) && (ARMA_VERSION_MINOR < 400))) - #error "need Armadillo version 8.400 or later" +#if ((ARMA_VERSION_MAJOR < 9) || ((ARMA_VERSION_MAJOR == 9) && (ARMA_VERSION_MINOR < 800))) + #error "need Armadillo version 9.800 or later" #endif #include @@ -85,10 +85,14 @@ #include "ensmallen_bits/problems/problems.hpp" // TODO: should move to another place +#include "ensmallen_bits/ada_belief/ada_belief.hpp" #include "ensmallen_bits/ada_bound/ada_bound.hpp" #include "ensmallen_bits/ada_delta/ada_delta.hpp" #include "ensmallen_bits/ada_grad/ada_grad.hpp" +#include "ensmallen_bits/ada_sqrt/ada_sqrt.hpp" #include "ensmallen_bits/adam/adam.hpp" +#include "ensmallen_bits/demon_adam/demon_adam.hpp" +#include "ensmallen_bits/demon_sgd/demon_sgd.hpp" #include "ensmallen_bits/qhadam/qhadam.hpp" #include "ensmallen_bits/aug_lagrangian/aug_lagrangian.hpp" #include "ensmallen_bits/bigbatch_sgd/bigbatch_sgd.hpp" @@ -131,5 +135,6 @@ #include "ensmallen_bits/svrg/svrg.hpp" #include "ensmallen_bits/swats/swats.hpp" #include "ensmallen_bits/wn_grad/wn_grad.hpp" +#include "ensmallen_bits/yogi/yogi.hpp" #endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp new file mode 100644 index 0000000..1a4b13c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp @@ -0,0 +1,186 @@ +/** + * @file ada_belief.hpp + * @author Marcus Edel + * + * Class wrapper for the AdaBelief update Policy. The intuition for AdaBelief is + * to adapt the stepsize according to the "belief" in the current gradient + * direction. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_HPP +#define ENSMALLEN_ADA_BELIEF_HPP + +#include +#include "ada_belief_update.hpp" + +namespace ens { + +/** + * The intuition for AdaBelief is to adapt the stepsize according to the + * "belief" in the current gradient direction. 
For more information, see the + * following. + * + * @code + * @misc{zhuang2020adabelief, + * title = {AdaBelief Optimizer: Adapting Stepsizes by the Belief in + * Observed Gradients}, + * author = {Juntang Zhuang and Tommy Tang and Sekhar Tatikonda and + * Nicha Dvornek and Yifan Ding and Xenophon Papademetris + * and James S. Duncan}, + * year = {2020}, + * eprint = {2010.07468}, + * archivePrefix = {arXiv}, + * } + * @endcode + * + * AdaBelief can optimize differentiable separable functions. For more details, + * see the documentation on function types included with this distribution or + * on the ensmallen website. + */ +class AdaBelief +{ + public: + /** + * Construct the AdaBelief optimizer with the given function and parameters. + * AdaBelief is sensitive to its parameters and hence a good hyperparameter + * selection is necessary as its default may not fit every case. + * + * The maximum number of iterations refers to the maximum number of + * points that are processed (i.e., one iteration equals one point; one + * iteration does not equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param beta1 The exponential decay rate for the 1st moment estimates. + * @param beta2 The exponential decay rate for the 2nd moment estimates. + * @param epsilon A small constant for numerical stability. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + AdaBelief(const double stepSize = 0.001, + const size_t batchSize = 32, + const double beta1 = 0.9, + const double beta2 = 0.999, + const double epsilon = 1e-12, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false); + + /** + * Optimize the given function using AdaBelief. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename std::enable_if::value, + typename MatType::elem_type>::type + Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! 
Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the exponential decay rate for the 1st moment estimates. + double Beta1() const { return optimizer.UpdatePolicy().Beta1(); } + //! Modify the exponential decay rate for the 1st moment estimates. + double& Beta1() { return optimizer.UpdatePolicy().Beta1(); } + + //! Get the exponential decay rate for the 2nd moment estimates. + double Beta2() const { return optimizer.UpdatePolicy().Beta2(); } + //! Get the second moment coefficient. + double& Beta2() { return optimizer.UpdatePolicy().Beta2(); } + + //! Get the value for numerical stability. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used for numerical stability. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters are reset before + //! Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with AdaBelief policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. +#include "ada_belief_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp new file mode 100644 index 0000000..485efe5 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp @@ -0,0 +1,44 @@ +/** + * @file ada_belief_impl.hpp + * @author Marcus Edel + * + * Implementation of AdaBelief class wrapper. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_ADA_BELIEF_IMPL_HPP +#define ENSMALLEN_ADA_BELIEF_ADA_BELIEF_IMPL_HPP + +// In case it hasn't been included yet. 
+#include "ada_belief.hpp" + +namespace ens { + +inline AdaBelief::AdaBelief( + const double stepSize, + const size_t batchSize, + const double beta1, + const double beta2, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + AdaBeliefUpdate(epsilon, beta1, beta2), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + + #endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp new file mode 100644 index 0000000..f768987 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp @@ -0,0 +1,153 @@ +/** + * @file ada_belief_update.hpp + * @author Marcus Edel + * + * AdaBelief optimizer update policy. The intuition for AdaBelief is to adapt + * the stepsize according to the "belief" in the current gradient direction. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_ADA_BELIEF_UPDATE_HPP +#define ENSMALLEN_ADA_BELIEF_ADA_BELIEF_UPDATE_HPP + +namespace ens { + +/** + * The intuition for AdaBelief is to adapt the stepsize according to the + * "belief" in the current gradient direction. + * + * For more information, see the following. + * + * @code + * @misc{zhuang2020adabelief, + * title = {AdaBelief Optimizer: Adapting Stepsizes by the Belief in + * Observed Gradients}, + * author = {Juntang Zhuang and Tommy Tang and Sekhar Tatikonda and + * Nicha Dvornek and Yifan Ding and Xenophon Papademetris + * and James S. Duncan}, + * year = {2020}, + * eprint = {2010.07468}, + * archivePrefix = {arXiv}, + * } + * @endcode + */ +class AdaBeliefUpdate +{ + public: + /** + * Construct the AdaBelief update policy with the given parameters. + * + * @param epsilon A small constant for numerical stability. + * @param beta1 The exponential decay rate for the 1st moment estimates. + * @param beta2 The exponential decay rate for the 2nd moment estimates. + */ + AdaBeliefUpdate(const double epsilon = 1e-8, + const double beta1 = 0.9, + const double beta2 = 0.999) : + epsilon(epsilon), + beta1(beta1), + beta2(beta2) + { + // Nothing to do. + } + + //! Get the value for numerical stability. + double Epsilon() const { return epsilon; } + //! Modify the value used for numerical stability. + double& Epsilon() { return epsilon; } + + //! Get the exponential decay rate for the 1st moment estimates. + double Beta1() const { return beta1; } + //! Modify the exponential decay rate for the 1st moment estimates. + double& Beta1() { return beta1; } + + //! Get the exponential decay rate for the 2nd moment estimates. + double Beta2() const { return beta2; } + //! Modify the exponential decay rate for the 2nd moment estimates. + double& Beta2() { return beta2; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. 
+ */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. + * + * @param parent AdaBeliefUpdate object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(AdaBeliefUpdate& parent, const size_t rows, const size_t cols) : + parent(parent), + iteration(0) + { + m.zeros(rows, cols); + s.zeros(rows, cols); + } + + /** + * Update step for AdaBelief. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + // Increment the iteration counter variable. + ++iteration; + + m *= parent.beta1; + m += (1 - parent.beta1) * gradient; + + s *= parent.beta2; + s += (1 - parent.beta2) * arma::pow(gradient - m, 2.0) + parent.epsilon; + + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); + + // And update the iterate. + iterate -= ((m / biasCorrection1) * stepSize) / (arma::sqrt(s / + biasCorrection2) + parent.epsilon); + } + + private: + //! Instantiated parent object. + AdaBeliefUpdate& parent; + + //! The exponential moving average of gradient values. + GradType m; + + // The exponential moving average of squared gradient values. + GradType s; + + // The number of iterations. + size_t iteration; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; + + // The xponential decay rate for the 1st moment estimates. + double beta1; + + // The exponential decay rate for the 2nd moment estimates. + double beta2; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp b/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp index 4aeeaab..3a84d87 100644 --- a/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp +++ b/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp @@ -56,8 +56,7 @@ class AdaBoundUpdate gamma(gamma), epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -87,11 +86,6 @@ class AdaBoundUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -111,7 +105,7 @@ class AdaBoundUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdaBoundUpdate& parent, const size_t rows, const size_t cols) : - parent(parent), first(true), initialStepSize(0) + parent(parent), first(true), initialStepSize(0), iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -139,7 +133,7 @@ class AdaBoundUpdate } // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // Decay the first and second moment running average coefficient. 
m *= parent.beta1; @@ -148,16 +142,12 @@ class AdaBoundUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const ElemType fl = parent.finalLr * stepSize / initialStepSize; - const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * - parent.iteration + 1)); - const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * - parent.iteration)); + const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * iteration + 1)); + const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * iteration)); // Applies bounds on actual learning rate. iterate -= arma::clamp((stepSize * @@ -180,6 +170,9 @@ class AdaBoundUpdate // The initial (Adam) learning rate. double initialStepSize; + + // The number of iterations. + size_t iteration; }; private: @@ -197,9 +190,6 @@ class AdaBoundUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp b/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp index b2d1b98..270f8eb 100644 --- a/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp +++ b/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp @@ -56,8 +56,7 @@ class AMSBoundUpdate gamma(gamma), epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -87,11 +86,6 @@ class AMSBoundUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -111,7 +105,7 @@ class AMSBoundUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AMSBoundUpdate& parent, const size_t rows, const size_t cols) : - parent(parent), first(true), initialStepSize(0) + parent(parent), first(true), initialStepSize(0), iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -140,7 +134,7 @@ class AMSBoundUpdate } // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // Decay the first and second moment running average coefficient. m *= parent.beta1; @@ -149,16 +143,12 @@ class AMSBoundUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const ElemType fl = parent.finalLr * stepSize / initialStepSize; - const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * - parent.iteration + 1)); - const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * - parent.iteration)); + const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * iteration + 1)); + const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * iteration)); // Element wise maximum of past and present squared gradients. 
vImproved = arma::max(vImproved, v); @@ -187,6 +177,9 @@ class AMSBoundUpdate // The optimal squared gradient value. GradType vImproved; + + // The number of iterations. + size_t iteration; }; private: @@ -204,9 +197,6 @@ class AMSBoundUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp new file mode 100644 index 0000000..7f1788c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp @@ -0,0 +1,168 @@ +/** + * @file ada_sqrt.hpp + * @author Marcus Edel + * + * Implementation of the AdaSqrt optimizer. AdaSqrt is an optimizer that + * chooses learning rate dynamically by adapting to the data and iteration. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_HPP + +#include "../sgd/sgd.hpp" +#include "ada_sqrt_update.hpp" + +namespace ens { + +/** + * AdaSqrt is a modified version of stochastic gradient descent which performs + * larger updates for more sparse parameters and smaller updates for less sparse + * parameters. + * + * For more information, see the following. + * + * @code + * @misc{hu2019secondorder, + * title = {Second-order Information in First-order Optimization Methods}, + * author = {Yuzheng Hu and Licong Lin and Shange Tang}, + * year = {2019}, + * eprint = {1912.09926}, + * } + * @endcode + * + * AdaSqrt can optimize differentiable separable functions. For more details, + * see the documentation on function types included with this distribution or on + * the ensmallen website. + */ +class AdaSqrt +{ + public: + /** + * Construct the AdaSqrt optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in one step. + * @param epsilon Value used to initialise the squared gradient parameter. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + AdaSqrt(const double stepSize = 0.01, + const size_t batchSize = 32, + const double epsilon = 1e-8, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false); + + /** + * Optimize the given function using AdaSqrt. 
The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to be optimized. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename std::enable_if::value, + typename MatType::elem_type>::type + Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with AdaSqrt policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. 
+#include "ada_sqrt_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp new file mode 100644 index 0000000..1bc36d9 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp @@ -0,0 +1,38 @@ +/** + * @file ada_sqrt_impl.hpp + * @author Marcus Edel + * + * Implementation of AdaSqrt optimizer. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_IMPL_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_IMPL_HPP + +namespace ens { + +inline AdaSqrt::AdaSqrt(const double stepSize, + const size_t batchSize, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + AdaSqrtUpdate(epsilon), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp new file mode 100644 index 0000000..feae24c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp @@ -0,0 +1,118 @@ +/** + * @file ada_sqrt_update.hpp + * @author Marcus Edel + * + * AdaSqrt update for Stochastic Gradient Descent. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_UPDATE_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_UPDATE_HPP + +namespace ens { + +/** + * Implementation of the AdaSqrt update policy. AdaSqrt update policy chooses + * learning rate dynamically by adapting to the data and iteration. + * + * For more information, see the following. + * + * @code + * @misc{hu2019secondorder, + * title = {Second-order Information in First-order Optimization Methods}, + * author = {Yuzheng Hu and Licong Lin and Shange Tang}, + * year = {2019}, + * eprint = {1912.09926}, + * } + * @endcode + * + */ +class AdaSqrtUpdate +{ + public: + /** + * Construct the AdaSqrt update policy with given epsilon parameter. + * + * @param epsilon The epsilon value used to initialise the squared gradient + * parameter. + */ + AdaSqrtUpdate(const double epsilon = 1e-8) : epsilon(epsilon) + { + // Nothing to do. + } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return epsilon; } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return epsilon; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD optimizer before the start of the + * iteration update process. 
In AdaSqrt update policy, squared gradient + * matrix is initialized to the zeros matrix with the same size as gradient + * matrix (see ens::SGD<>). + * + * @param parent Instantiated parent class. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(AdaSqrtUpdate& parent, const size_t rows, const size_t cols) : + parent(parent), + squaredGradient(rows, cols), + iteration(0) + { + // Initialize an empty matrix for sum of squares of parameter gradient. + squaredGradient.zeros(); + } + + /** + * Update step for SGD. The AdaSqrt update adapts the learning rate by + * performing larger updates for more sparse parameters and smaller updates + * for less sparse parameters. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + ++iteration; + + squaredGradient += arma::square(gradient); + + iterate -= stepSize * std::sqrt(iteration) * gradient / + (squaredGradient + parent.epsilon); + } + + private: + // Instantiated parent class. + AdaSqrtUpdate& parent; + // The squared gradient matrix. + GradType squaredGradient; + // The number of iterations. + size_t iteration; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/adam/adam_update.hpp b/inst/include/ensmallen_bits/adam/adam_update.hpp index 8831c7b..de7f61e 100644 --- a/inst/include/ensmallen_bits/adam/adam_update.hpp +++ b/inst/include/ensmallen_bits/adam/adam_update.hpp @@ -52,8 +52,7 @@ class AdamUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -73,11 +72,6 @@ class AdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -97,7 +91,8 @@ class AdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -115,7 +110,7 @@ class AdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -124,10 +119,8 @@ class AdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); /** * It should be noted that the term, m / (arma::sqrt(v) + eps), in the @@ -147,6 +140,9 @@ class AdamUpdate // The exponential moving average of squared gradient values. GradType v; + + // The number of iterations. + size_t iteration; }; private: @@ -158,9 +154,6 @@ class AdamUpdate // The second moment coefficient. 
double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/adamax_update.hpp b/inst/include/ensmallen_bits/adam/adamax_update.hpp index 38ef910..a6c9f2f 100644 --- a/inst/include/ensmallen_bits/adam/adamax_update.hpp +++ b/inst/include/ensmallen_bits/adam/adamax_update.hpp @@ -54,8 +54,7 @@ class AdaMaxUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -75,11 +74,6 @@ class AdaMaxUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -99,7 +93,8 @@ class AdaMaxUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdaMaxUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); u.zeros(rows, cols); @@ -117,7 +112,7 @@ class AdaMaxUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -127,8 +122,7 @@ class AdaMaxUpdate u *= parent.beta2; u = arma::max(u, arma::abs(gradient)); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); if (biasCorrection1 != 0) iterate -= (stepSize / biasCorrection1 * m / (u + parent.epsilon)); @@ -141,6 +135,8 @@ class AdaMaxUpdate GradType m; // The exponentially weighted infinity norm. GradType u; + // The number of iterations. + size_t iteration; }; private: @@ -152,9 +148,6 @@ class AdaMaxUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/amsgrad_update.hpp b/inst/include/ensmallen_bits/adam/amsgrad_update.hpp index d49314f..f1f420e 100644 --- a/inst/include/ensmallen_bits/adam/amsgrad_update.hpp +++ b/inst/include/ensmallen_bits/adam/amsgrad_update.hpp @@ -47,8 +47,7 @@ class AMSGradUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -68,11 +67,6 @@ class AMSGradUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -92,7 +86,8 @@ class AMSGradUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AMSGradUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -111,7 +106,7 @@ class AMSGradUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. 
m *= parent.beta1; @@ -120,10 +115,8 @@ class AMSGradUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); // Element wise maximum of past and present squared gradients. vImproved = arma::max(vImproved, v); @@ -144,6 +137,9 @@ class AMSGradUpdate // The optimal squared gradient value. GradType vImproved; + + // The number of iterations. + size_t iteration; }; private: @@ -155,9 +151,6 @@ class AMSGradUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/nadam_update.hpp b/inst/include/ensmallen_bits/adam/nadam_update.hpp index 880de5a..24f105c 100644 --- a/inst/include/ensmallen_bits/adam/nadam_update.hpp +++ b/inst/include/ensmallen_bits/adam/nadam_update.hpp @@ -50,8 +50,7 @@ class NadamUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - scheduleDecay(scheduleDecay), - iteration(0) + scheduleDecay(scheduleDecay) { // Nothing to do. } @@ -76,11 +75,6 @@ class NadamUpdate //! Modify the decay parameter for decay coefficients double& ScheduleDecay() { return scheduleDecay; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -101,7 +95,8 @@ class NadamUpdate */ Policy(NadamUpdate& parent, const size_t rows, const size_t cols) : parent(parent), - cumBeta1(1) + cumBeta1(1), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -119,7 +114,7 @@ class NadamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -129,18 +124,15 @@ class NadamUpdate v += (1 - parent.beta2) * gradient % gradient; double beta1T = parent.beta1 * (1 - (0.5 * - std::pow(0.96, parent.iteration * parent.scheduleDecay))); + std::pow(0.96, iteration * parent.scheduleDecay))); double beta1T1 = parent.beta1 * (1 - (0.5 * - std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay))); + std::pow(0.96, (iteration + 1) * parent.scheduleDecay))); cumBeta1 *= beta1T; const double biasCorrection1 = 1.0 - cumBeta1; - - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); - + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const double biasCorrection3 = 1.0 - (cumBeta1 * beta1T1); /* Note :- arma::sqrt(v) + epsilon * sqrt(biasCorrection2) is approximated @@ -163,6 +155,9 @@ class NadamUpdate // The cumulative product of decay coefficients. double cumBeta1; + + // The number of iterations. + size_t iteration; }; private: @@ -177,9 +172,6 @@ class NadamUpdate // The decay parameter for decay coefficients. double scheduleDecay; - - // The number of iterations. 
- size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/nadamax_update.hpp b/inst/include/ensmallen_bits/adam/nadamax_update.hpp index c5e72f8..f0d9b0c 100644 --- a/inst/include/ensmallen_bits/adam/nadamax_update.hpp +++ b/inst/include/ensmallen_bits/adam/nadamax_update.hpp @@ -50,8 +50,7 @@ class NadaMaxUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - scheduleDecay(scheduleDecay), - iteration(0) + scheduleDecay(scheduleDecay) { // Nothing to do. } @@ -76,11 +75,6 @@ class NadaMaxUpdate //! Modify the decay parameter for decay coefficients double& ScheduleDecay() { return scheduleDecay; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -101,7 +95,8 @@ class NadaMaxUpdate */ Policy(NadaMaxUpdate& parent, const size_t rows, const size_t cols) : parent(parent), - cumBeta1(1) + cumBeta1(1), + iteration(0) { m.zeros(rows, cols); u.zeros(rows, cols); @@ -119,7 +114,7 @@ class NadaMaxUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -128,10 +123,10 @@ class NadaMaxUpdate u = arma::max(u * parent.beta2, arma::abs(gradient)); double beta1T = parent.beta1 * (1 - (0.5 * - std::pow(0.96, parent.iteration * parent.scheduleDecay))); + std::pow(0.96, iteration * parent.scheduleDecay))); double beta1T1 = parent.beta1 * (1 - (0.5 * - std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay))); + std::pow(0.96, (iteration + 1) * parent.scheduleDecay))); cumBeta1 *= beta1T; @@ -158,6 +153,9 @@ class NadaMaxUpdate // The cumulative product of decay coefficients. double cumBeta1; + + // The number of iterations. + size_t iteration; }; private: @@ -172,9 +170,6 @@ class NadaMaxUpdate // The decay parameter for decay coefficients. double scheduleDecay; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp b/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp index 7fa9fbb..426a5bb 100644 --- a/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp +++ b/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp @@ -51,8 +51,7 @@ class OptimisticAdamUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -72,11 +71,6 @@ class OptimisticAdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -96,7 +90,8 @@ class OptimisticAdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(OptimisticAdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -115,7 +110,7 @@ class OptimisticAdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. 
m *= parent.beta1; @@ -124,13 +119,10 @@ class OptimisticAdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * arma::square(gradient); - GradType mCorrected = m / (1.0 - std::pow(parent.beta1, - parent.iteration)); - GradType vCorrected = v / (1.0 - std::pow(parent.beta2, - parent.iteration)); + GradType mCorrected = m / (1.0 - std::pow(parent.beta1, iteration)); + GradType vCorrected = v / (1.0 - std::pow(parent.beta2, iteration)); - GradType update = mCorrected / - (arma::sqrt(vCorrected) + parent.epsilon); + GradType update = mCorrected / (arma::sqrt(vCorrected) + parent.epsilon); iterate -= (2 * stepSize * update - stepSize * g); @@ -149,6 +141,9 @@ class OptimisticAdamUpdate // The previous update. GradType g; + + // The number of iterations. + size_t iteration; }; private: @@ -160,9 +155,6 @@ class OptimisticAdamUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp b/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp new file mode 100644 index 0000000..e524531 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp @@ -0,0 +1,207 @@ +/** + * @file demon_adam.hpp + * @author Marcus Edel + * + * Definition of DemonAdam. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_ADAM_DEMON_ADAM_HPP +#define ENSMALLEN_DEMON_ADAM_DEMON_ADAM_HPP + +#include "../sgd/sgd.hpp" +#include "../adam/adam_update.hpp" +#include "../adam/adamax_update.hpp" +#include "../adam/amsgrad_update.hpp" +#include "../adam/nadam_update.hpp" +#include "../adam/nadamax_update.hpp" +#include "../adam/optimisticadam_update.hpp" +#include "demon_adam_update.hpp" + +namespace ens { + +/** + * DemonAdam automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * + * DemonAdam can optimize differentiable separable functions. For more details, + * see the documentation on function types include with this distribution or on + * the ensmallen website. + * + * @tparam UpdateRule Adam optimizer update rule to be used. + */ +template +class DemonAdamType +{ + public: + /** + * Construct the DemonAdam optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param momentum The initial momentum coefficient. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param beta1 Exponential decay rate for the first moment estimates. 
+ * @param beta2 Exponential decay rate for the weighted infinity norm + * estimates. + * @param eps Value used to initialise the mean squared gradient parameter. + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + DemonAdamType(const double stepSize = 0.001, + const size_t batchSize = 32, + const double momentum = 0.9, + const double beta1 = 0.9, + const double beta2 = 0.999, + const double eps = 1e-8, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + DemonAdamUpdate(maxIterations * batchSize, + momentum, UpdateRule(eps, beta1, beta2)), + NoDecay(), + resetPolicy, + exactObjective) + { /* Nothing to do here. */ } + + /** + * Optimize the given function using DemonAdam. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.template Optimize< + SeparableFunctionType, MatType, GradType, CallbackTypes...>( + function, iterate, std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the moment coefficient. + double Momentum() const { return optimizer.UpdatePolicy().Momentum(); } + //! Modify the moment coefficient. + double& Momentum() { return optimizer.UpdatePolicy().Momentum(); } + + //! Get the momentum iteration number. + size_t MomentumIterations() const + { return optimizer.UpdatePolicy().MomentumIterations(); } + //! Modify the momentum iteration number. + size_t& MomentumIterations() + { return optimizer.UpdatePolicy().MomentumIterations(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! 
Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with DemonAdam policy. + SGD> optimizer; +}; + +using DemonAdam = DemonAdamType; + +using DemonAdaMax = DemonAdamType; + +using DemonAMSGrad = DemonAdamType; + +using DemonNadam = DemonAdamType; + +using DemonNadaMax = DemonAdamType; + +using DemonOptimisticAdam = DemonAdamType; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp b/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp new file mode 100644 index 0000000..47f6b36 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp @@ -0,0 +1,169 @@ +/** + * @file demon_sgd_update.hpp + * @author Marcus Edel + * + * Implementation of DemonAdam. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_ADAM_DEMON_ADAM_UPDATE_HPP +#define ENSMALLEN_DEMON_ADAM_DEMON_ADAM_UPDATE_HPP + +#include + +namespace ens { + +/** + * DemonAdam automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * @endcode + * + * @tparam UpdateRule DemonAdam optimizer update rule to be used. + */ +template +class DemonAdamUpdate +{ + public: + /** + * Construct the DemonAdam update policy with the given parameters. + * + * @param momentumIterations The number of iterations before the momentum + * will decay to zero. + * @param momentum The initial momentum coefficient. + * @param adamUpdate Instantiated Adam update policy used to adjust the given + * parameters. + */ + DemonAdamUpdate(const size_t momentumIterations = 100, + const double momentum = 0.9, + const UpdateRule& adamUpdate = UpdateRule()) : + T(momentumIterations), + betaInit(momentum), + t(0), + adamUpdateInst(adamUpdate) + { + // Make sure the momentum iterations parameter is non-zero. 
+    assert(momentumIterations != 0 && "The number of iterations before the "
+        "momentum will decay is zero, make sure the max iterations and "
+        "batch size parameter is set correctly. "
+        "Default: momentumIterations = maxIterations / batchSize.");
+  }
+
+  //! Get the momentum coefficient.
+  double Momentum() const { return betaInit; }
+  //! Modify the momentum coefficient.
+  double& Momentum() { return betaInit; }
+
+  //! Get the current iteration number.
+  size_t Iteration() const { return t; }
+  //! Modify the current iteration number.
+  size_t& Iteration() { return t; }
+
+  //! Get the momentum iteration number.
+  size_t MomentumIterations() const { return T; }
+  //! Modify the momentum iteration number.
+  size_t& MomentumIterations() { return T; }
+
+  /**
+   * The UpdatePolicyType policy classes must contain an internal 'Policy'
+   * template class with two template arguments: MatType and GradType. This is
+   * instantiated at the start of the optimization, and holds parameters
+   * specific to an individual optimization.
+   */
+  template<typename MatType, typename GradType>
+  class Policy
+  {
+   public:
+    // Convenient typedef.
+    typedef typename UpdateRule::template Policy<MatType, GradType>
+        InstUpdateRuleType;
+
+    /**
+     * This constructor is called by the SGD Optimize() method before the start
+     * of the iteration update process.
+     *
+     * @param parent Instantiated DemonAdamUpdate parent object.
+     * @param rows Number of rows in the gradient matrix.
+     * @param cols Number of columns in the gradient matrix.
+     */
+    Policy(DemonAdamUpdate& parent,
+           const size_t rows,
+           const size_t cols) :
+        parent(parent),
+        adamUpdate(new InstUpdateRuleType(parent.adamUpdateInst, rows, cols))
+    { /* Nothing to do here */ }
+
+    /**
+     * Clean any memory associated with the Policy object.
+     */
+    ~Policy()
+    {
+      delete adamUpdate;
+    }
+
+    /**
+     * Update step for DemonAdam.
+     *
+     * @param iterate Parameters that minimize the function.
+     * @param stepSize Step size to be used for the given iteration.
+     * @param gradient The gradient matrix.
+     */
+    void Update(MatType& iterate,
+                const double stepSize,
+                const GradType& gradient)
+    {
+      double decayRate = 1;
+      if (parent.t > 0)
+        decayRate = 1.0 - (double) parent.t / (double) parent.T;
+
+      const double betaDecay = parent.betaInit * decayRate;
+      const double beta = betaDecay / ((1.0 - parent.betaInit) + betaDecay);
+
+      // Perform the update.
+      iterate *= beta;
+
+      // Apply the adam update.
+      adamUpdate->Update(iterate, stepSize, gradient);
+
+      // Increment the iteration counter variable.
+      ++parent.t;
+    }
+
+   private:
+    //! Instantiated parent object.
+    DemonAdamUpdate& parent;
+
+    //! The update policy.
+    InstUpdateRuleType* adamUpdate;
+  };
+
+ private:
+  //! The number of momentum iterations.
+  size_t T;
+
+  //! Initial momentum coefficient.
+  double betaInit;
+
+  //! The number of iterations.
+  size_t t;
+
+  //! The adam update policy.
+  UpdateRule adamUpdateInst;
+};
+
+} // namespace ens
+
+#endif
diff --git a/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp b/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp
new file mode 100644
index 0000000..ddbf1d2
--- /dev/null
+++ b/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp
@@ -0,0 +1,178 @@
+/**
+ * @file demon_sgd.hpp
+ * @author Marcus Edel
+ *
+ * Definition of DemonSGD.
+ *
+ * ensmallen is free software; you may redistribute it and/or modify it under
+ * the terms of the 3-clause BSD license. You should have received a copy of
+ * the 3-clause BSD license along with ensmallen. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */ +#ifndef ENSMALLEN_DEMON_SGD_DEMON_SGD_HPP +#define ENSMALLEN_DEMON_SGD_DEMON_SGD_HPP + +#include "../sgd/sgd.hpp" +#include "demon_sgd_update.hpp" + +namespace ens { + +/** + * DemonSGD automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * + * DemonSGD can optimize differentiable separable functions. For more details, + * see the documentation on function types include with this distribution or on + * the ensmallen website. + */ +class DemonSGD +{ + public: + /** + * Construct the DemonSGD optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param momentum The initial momentum coefficient. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + DemonSGD(const double stepSize = 0.001, + const size_t batchSize = 32, + const double momentum = 0.9, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + DemonSGDUpdate(maxIterations * batchSize, momentum), + NoDecay(), + resetPolicy, + exactObjective) + { /* Nothing to do here. */ } + + /** + * Optimize the given function using DemonSGD. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.template Optimize< + SeparableFunctionType, MatType, GradType, CallbackTypes...>( + function, iterate, std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. 
+ template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the moment coefficient. + double Momentum() const { return optimizer.UpdatePolicy().Momentum(); } + //! Modify the moment coefficient. + double& Momentum() { return optimizer.UpdatePolicy().Momentum(); } + + //! Get the momentum iteration number. + size_t MomentumIterations() const + { return optimizer.UpdatePolicy().MomentumIterations(); } + //! Modify the momentum iteration number. + size_t& MomentumIterations() + { return optimizer.UpdatePolicy().MomentumIterations(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with DemonSGD policy. + SGD optimizer; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp b/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp new file mode 100644 index 0000000..dc8b7c5 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp @@ -0,0 +1,139 @@ +/** + * @file demon_sgd_update.hpp + * @author Marcus Edel + * + * Implementation of DemonSGD. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_SGD_DEMON_SGD_UPDATE_HPP +#define ENSMALLEN_DEMON_SGD_DEMON_SGD_UPDATE_HPP + +namespace ens { + +/** + * DemonSGD automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. 
+ * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * @endcode + */ +class DemonSGDUpdate +{ + public: + /** + * Construct the DemonSGD update policy with the given parameters. + * + * @param momentumIterations The number of iterations before the momentum + * will decay to zero. + * @param momentum The initial momentum coefficient. + */ + DemonSGDUpdate(const size_t momentumIterations = 100, + const double momentum = 0.9) : + T(momentumIterations), + betaInit(momentum), + t(0) + { + // Make sure the momentum iterations parameter is non-zero. + assert(momentumIterations != 0 && "The number of iterations before the " + "momentum will decay is zero, make sure the max iterations and " + "batch size parameter is set correctly. " + "Default: momentumIterations = maxIterations * batchSize."); + } + + //! Get the momentum coefficient. + double Momentum() const { return betaInit; } + //! Modify the momentum coefficient. + double& Momentum() { return betaInit; } + + //! Get the current iteration number. + size_t Iteration() const { return t; } + //! Modify the current iteration number. + size_t& Iteration() { return t; } + + //! Get the momentum iteration number. + size_t MomentumIterations() const { return T; } + //! Modify the momentum iteration number. + size_t& MomentumIterations() { return T; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. + * + * @param parent Instantiated PadamUpdate parent object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(DemonSGDUpdate& parent, + const size_t /* rows */, + const size_t /* cols */) : + parent(parent) + { /* Nothing to do here */ } + + /** + * Update step for DemonSGD. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + double decayRate = 1; + if (parent.t > 0) + decayRate = 1.0 - (double) parent.t / (double) parent.T; + + const double betaDecay = parent.betaInit * decayRate; + const double beta = betaDecay / ((1.0 - parent.betaInit) + betaDecay); + + // Perform the update. + iterate *= beta; + iterate -= stepSize * gradient; + + // Increment the iteration counter variable. + ++parent.t; + } + + private: + //! Instantiated parent object. + DemonSGDUpdate& parent; + }; + + private: + //! The number of momentum iterations. + size_t T; + + //! Initial momentum coefficient. + double betaInit; + + //! The number of iterations. 
+ size_t t; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ens_version.hpp b/inst/include/ensmallen_bits/ens_version.hpp index 530d64c..185ed3f 100644 --- a/inst/include/ensmallen_bits/ens_version.hpp +++ b/inst/include/ensmallen_bits/ens_version.hpp @@ -15,17 +15,17 @@ #define ENS_VERSION_MAJOR 2 // The minor version is two digits so regular numerical comparisons of versions // work right. The first minor version of a release is always 10. -#define ENS_VERSION_MINOR 18 -#define ENS_VERSION_PATCH 2 +#define ENS_VERSION_MINOR 19 +#define ENS_VERSION_PATCH 0 // If this is a release candidate, it will be reflected in the version name // (i.e. the version name will be "RC1", "RC2", etc.). Otherwise the version // name will typically be a seemingly arbitrary set of words that does not // contain the capitalized string "RC". -#define ENS_VERSION_NAME "Fairmount Bagel" +#define ENS_VERSION_NAME "Eight Ball Deluxe" // Incorporate the date the version was released. #define ENS_VERSION_YEAR "2022" -#define ENS_VERSION_MONTH "02" -#define ENS_VERSION_DAY "13" +#define ENS_VERSION_MONTH "04" +#define ENS_VERSION_DAY "06" namespace ens { diff --git a/inst/include/ensmallen_bits/ftml/ftml_update.hpp b/inst/include/ensmallen_bits/ftml/ftml_update.hpp index 7a5ca1c..5db2b05 100644 --- a/inst/include/ensmallen_bits/ftml/ftml_update.hpp +++ b/inst/include/ensmallen_bits/ftml/ftml_update.hpp @@ -50,8 +50,7 @@ class FTMLUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { /* Do nothing. */ } //! Get the value used to initialise the squared gradient parameter. @@ -69,11 +68,6 @@ class FTMLUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -112,16 +106,14 @@ class FTMLUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); MatType sigma = -parent.beta1 * d; d = biasCorrection1 / stepSize * @@ -145,6 +137,9 @@ class FTMLUpdate // Parameter update term. MatType d; + + // The number of iterations. + size_t iteration; }; private: @@ -156,9 +151,6 @@ class FTMLUpdate // The second moment coefficient. double beta2; - - // The number of iterations. 
- size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp b/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp index 289f006..f607c71 100644 --- a/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp +++ b/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp @@ -53,7 +53,7 @@ inline void Proximal::ProjectToL1Ball(MatType& v, double tau) if (nu > 0) break; } - double theta = (simplexSum(rho) - tau) / rho; + const double theta = (simplexSum(rho) - tau) / rho; // Threshold on absolute value of v with theta. for (arma::uword j = 0; j < simplexSol.n_rows; j++) diff --git a/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp b/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp index fbcad95..28c1552 100644 --- a/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp +++ b/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp @@ -79,7 +79,7 @@ double L_BFGS::ChooseScalingFactor(const size_t iterationNum, { typedef typename CubeType::elem_type CubeElemType; - double scalingFactor = 1.0; + double scalingFactor; if (iterationNum > 0) { int previousPos = (iterationNum - 1) % numBasis; @@ -378,7 +378,7 @@ L_BFGS::Optimize(FunctionType& function, terminate |= Callback::EvaluateWithGradient(*this, f, iterate, functionValue, gradient, callbacks...); - ElemType prevFunctionValue = functionValue; + ElemType prevFunctionValue; // The main optimization loop. terminate |= Callback::BeginOptimization(*this, f, iterate, callbacks...); diff --git a/inst/include/ensmallen_bits/padam/padam_update.hpp b/inst/include/ensmallen_bits/padam/padam_update.hpp index 570b977..a4a6924 100644 --- a/inst/include/ensmallen_bits/padam/padam_update.hpp +++ b/inst/include/ensmallen_bits/padam/padam_update.hpp @@ -50,8 +50,7 @@ class PadamUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - partial(partial), - iteration(0) + partial(partial) { // Nothing to do. } @@ -76,11 +75,6 @@ class PadamUpdate //! Modify the partial adaptive parameter. double& Partial() { return partial; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -100,7 +94,8 @@ class PadamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(PadamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -119,7 +114,7 @@ class PadamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -128,10 +123,8 @@ class PadamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); // Element wise maximum of past and present squared gradients. vImproved = arma::max(vImproved, v); @@ -152,6 +145,9 @@ class PadamUpdate //! The optimal sqaured gradient value. GradType vImproved; + + //! The number of iterations. + size_t iteration; }; private: @@ -166,9 +162,6 @@ class PadamUpdate //! 
Partial adaptive parameter. double partial; - - //! The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp b/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp index 8540033..f408377 100644 --- a/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp +++ b/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp @@ -54,8 +54,7 @@ class QHAdamUpdate beta1(beta1), beta2(beta2), v1(v1), - v2(v2), - iteration(0) + v2(v2) { // Nothing to do. } @@ -75,11 +74,6 @@ class QHAdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - //! Get the first quasi-hyperbolic term. double V1() const { return v1; } //! Modify the first quasi-hyperbolic term. @@ -109,7 +103,8 @@ class QHAdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(QHAdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -127,7 +122,7 @@ class QHAdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -136,10 +131,8 @@ class QHAdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); GradType mDash = m / biasCorrection1; GradType vDash = v / biasCorrection2; @@ -160,6 +153,9 @@ class QHAdamUpdate // The exponential moving average of squared gradient values. GradType v; + + // The number of iterations. + size_t iteration; }; private: @@ -177,9 +173,6 @@ class QHAdamUpdate // The second quasi-hyperbolic term. double v2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/sa/sa_impl.hpp b/inst/include/ensmallen_bits/sa/sa_impl.hpp index 978b59e..f680f99 100644 --- a/inst/include/ensmallen_bits/sa/sa_impl.hpp +++ b/inst/include/ensmallen_bits/sa/sa_impl.hpp @@ -70,7 +70,7 @@ typename MatType::elem_type SA::Optimize( ElemType energy = function.Evaluate(iterate); Callback::Evaluate(*this, function, iterate, energy, callbacks...); - ElemType oldEnergy = energy; + ElemType oldEnergy; size_t idx = 0; size_t sweepCounter = 0; diff --git a/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp b/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp index 7fa8175..4043401 100644 --- a/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp +++ b/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp @@ -178,7 +178,6 @@ SPALeRASGD::Optimize( i += effectiveBatchSize; currentFunction += effectiveBatchSize; overallObjective += currentObjective; - currentObjective /= effectiveBatchSize; // Is this iteration the start of a sequence? 
if ((currentFunction % numFunctions) == 0) diff --git a/inst/include/ensmallen_bits/swats/swats_update.hpp b/inst/include/ensmallen_bits/swats/swats_update.hpp index 18f1524..0a80a77 100644 --- a/inst/include/ensmallen_bits/swats/swats_update.hpp +++ b/inst/include/ensmallen_bits/swats/swats_update.hpp @@ -50,7 +50,6 @@ class SWATSUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - iteration(0), phaseSGD(false), sgdRate(0), sgdLambda(0) @@ -73,11 +72,6 @@ class SWATSUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - //! Get whether the current phase is SGD. bool PhaseSGD() const { return phaseSGD; } //! Modify whether the current phase is SGD. @@ -111,7 +105,8 @@ class SWATSUpdate * @param cols Number of columns in the gradient matrix. */ Policy(SWATSUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -131,7 +126,7 @@ class SWATSUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; if (parent.phaseSGD) { @@ -150,10 +145,8 @@ class SWATSUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); GradType delta = stepSize * m / biasCorrection1 / (arma::sqrt(v / biasCorrection2) + parent.epsilon); @@ -167,8 +160,7 @@ class SWATSUpdate (1 - parent.beta2) * rate; parent.sgdRate = parent.sgdLambda / biasCorrection2; - if (std::abs(parent.sgdRate - rate) < parent.epsilon && - parent.iteration > 1) + if (std::abs(parent.sgdRate - rate) < parent.epsilon && iteration > 1) { parent.phaseSGD = true; v.zeros(); @@ -188,6 +180,9 @@ class SWATSUpdate //! The exponential moving average of squared gradient values (SGD). GradType sgdV; + + //! The number of iterations. + size_t iteration; }; private: @@ -200,9 +195,6 @@ class SWATSUpdate //! The second moment coefficient. double beta2; - //! The number of iterations. - size_t iteration; - //! Wether to use the SGD or Adam update rule. bool phaseSGD; diff --git a/inst/include/ensmallen_bits/yogi/yogi.hpp b/inst/include/ensmallen_bits/yogi/yogi.hpp new file mode 100644 index 0000000..4529d24 --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi.hpp @@ -0,0 +1,189 @@ +/** + * @file yogi.hpp + * @author Marcus Edel + * + * Class wrapper for the Yogi update Policy. Yogi is based on Adam with more + * fine grained effective learning rate control. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_HPP +#define ENSMALLEN_YOGI_YOGI_HPP + +#include +#include "yogi_update.hpp" + +namespace ens { + +/** + * Yogi is an variation of Adam with more fine grained effective learning rate + * control. + * + * For more information, see the following. 
+ *
+ * @code
+ * @inproceedings{Zaheer2018,
+ *   author = {Zaheer, Manzil and Reddi, Sashank J. and Sachan, Devendra
+ *             and Kale, Satyen and Kumar, Sanjiv},
+ *   title = {Adaptive Methods for Nonconvex Optimization},
+ *   year = {2018},
+ *   publisher = {Curran Associates Inc.},
+ *   booktitle = {Proceedings of the 32nd International Conference on Neural
+ *                Information Processing Systems},
+ *   pages = {9815–9825},
+ *   series = {NIPS'18}
+ * }
+ * @endcode
+ *
+ * Yogi can optimize differentiable separable functions. For more details,
+ * see the documentation on function types included with this distribution or
+ * on the ensmallen website.
+ */
+class Yogi
+{
+ public:
+  /**
+   * Construct the Yogi optimizer with the given function and parameters.
+   * Yogi is sensitive to its parameters, and hence good hyperparameter
+   * selection is necessary, as the defaults may not fit every case.
+   *
+   * The maximum number of iterations refers to the maximum number of
+   * points that are processed (i.e., one iteration equals one point; one
+   * iteration does not equal one pass over the dataset).
+   *
+   * @param stepSize Step size for each iteration.
+   * @param batchSize Number of points to process in a single step.
+   * @param beta1 Exponential decay rate for the first moment estimates.
+   * @param beta2 Exponential decay rate for the weighted infinity norm
+   *     estimates.
+   * @param epsilon Value used to initialise the mean squared gradient
+   *     parameter.
+   * @param maxIterations Maximum number of iterations allowed (0 means no
+   *     limit).
+   * @param tolerance Maximum absolute tolerance to terminate algorithm.
+   * @param shuffle If true, the function order is shuffled; otherwise, each
+   *     function is visited in linear order.
+   * @param resetPolicy If true, parameters are reset before every Optimize
+   *     call; otherwise, their values are retained.
+   * @param exactObjective Calculate the exact objective (Default: estimate the
+   *     final objective obtained on the last pass over the data).
+   */
+  Yogi(const double stepSize = 0.001,
+       const size_t batchSize = 32,
+       const double beta1 = 0.9,
+       const double beta2 = 0.999,
+       const double epsilon = 1e-8,
+       const size_t maxIterations = 100000,
+       const double tolerance = 1e-5,
+       const bool shuffle = true,
+       const bool resetPolicy = true,
+       const bool exactObjective = false);
+
+  /**
+   * Optimize the given function using Yogi. The given starting point will be
+   * modified to store the finishing point of the algorithm, and the final
+   * objective value is returned.
+   *
+   * @tparam SeparableFunctionType Type of the function to optimize.
+   * @tparam MatType Type of matrix to optimize with.
+   * @tparam GradType Type of matrix to use to represent function gradients.
+   * @tparam CallbackTypes Types of callback functions.
+   * @param function Function to optimize.
+   * @param iterate Starting point (will be modified).
+   * @param callbacks Callback functions.
+   * @return Objective value of the final point.
+   */
+  template<typename SeparableFunctionType,
+           typename MatType,
+           typename GradType,
+           typename... CallbackTypes>
+  typename std::enable_if<IsArmaType<GradType>::value,
+      typename MatType::elem_type>::type
+  Optimize(SeparableFunctionType& function,
+           MatType& iterate,
+           CallbackTypes&&... callbacks)
+  {
+    return optimizer.Optimize<SeparableFunctionType, MatType, GradType,
+        CallbackTypes...>(function, iterate,
+        std::forward<CallbackTypes>(callbacks)...);
+  }
+
+  //! Forward the MatType as GradType.
+  template<typename SeparableFunctionType,
+           typename MatType,
+           typename... CallbackTypes>
+  typename MatType::elem_type Optimize(SeparableFunctionType& function,
+                                       MatType& iterate,
+                                       CallbackTypes&&... callbacks)
+  {
+    return Optimize<SeparableFunctionType, MatType, MatType,
+        CallbackTypes...>(function, iterate,
+        std::forward<CallbackTypes>(callbacks)...);
+  }
+
+  //! Get the step size.
+ double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the smoothing parameter. + double Beta1() const { return optimizer.UpdatePolicy().Beta1(); } + //! Modify the smoothing parameter. + double& Beta1() { return optimizer.UpdatePolicy().Beta1(); } + + //! Get the second moment coefficient. + double Beta2() const { return optimizer.UpdatePolicy().Beta2(); } + //! Modify the second moment coefficient. + double& Beta2() { return optimizer.UpdatePolicy().Beta2(); } + + //! Get the value used to initialise the mean squared gradient parameter. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used to initialise the mean squared gradient parameter. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters are reset before + //! Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with Yogi policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. +#include "yogi_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/yogi/yogi_impl.hpp b/inst/include/ensmallen_bits/yogi/yogi_impl.hpp new file mode 100644 index 0000000..39a777b --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi_impl.hpp @@ -0,0 +1,44 @@ +/** + * @file yogi_impl.hpp + * @author Marcus Edel + * + * Implementation of Yogi class wrapper. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_IMPL_HPP +#define ENSMALLEN_YOGI_YOGI_IMPL_HPP + +// In case it hasn't been included yet. 
+#include "yogi.hpp" + +namespace ens { + +inline Yogi::Yogi( + const double stepSize, + const size_t batchSize, + const double beta1, + const double beta2, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + YogiUpdate(epsilon, beta1, beta2), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + + #endif diff --git a/inst/include/ensmallen_bits/yogi/yogi_update.hpp b/inst/include/ensmallen_bits/yogi/yogi_update.hpp new file mode 100644 index 0000000..cdba28d --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi_update.hpp @@ -0,0 +1,146 @@ +/** + * @file yogi_update.hpp + * @author Marcus Edel + * + * Implements the Yogi Optimizer. Yogi is a variant of Adam with more fine + * grained effective learning rate control. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_UPDATE_HPP +#define ENSMALLEN_YOGI_YOGI_UPDATE_HPP + +namespace ens { + +/** + * Yogi builds upon the Adam update strategy but provides more fine grained + * effective learning rate control. + * + * For more information, see the following. + * + * @code + * @inproceedings{Zaheer2018, + * author = {Zaheer, Manzil and Reddi, Sashank J. and Sachan, Devendra + * and Kale, Satyen and Kumar, Sanjiv}, + * title = {Adaptive Methods for Nonconvex Optimization}, + * year = {2018}, + * publisher = {Curran Associates Inc.}, + * booktitle = {Proceedings of the 32nd International Conference on Neural + * Information Processing Systems}, + * pages = {9815–9825}, + * series = {NIPS'18} + * } + * @endcode + */ +class YogiUpdate +{ + public: + /** + * Construct the Yogi update policy with the given parameters. + * + * @param epsilon The epsilon value used to initialise the squared gradient + * parameter. + * @param beta1 The smoothing parameter. + * @param beta2 The second moment coefficient. + * @param v1 The first quasi-hyperbolic term. + * @param v1 The second quasi-hyperbolic term. + */ + YogiUpdate(const double epsilon = 1e-8, + const double beta1 = 0.9, + const double beta2 = 0.999) : + epsilon(epsilon), + beta1(beta1), + beta2(beta2) + { + // Nothing to do. + } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return epsilon; } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return epsilon; } + + //! Get the smoothing parameter. + double Beta1() const { return beta1; } + //! Modify the smoothing parameter. + double& Beta1() { return beta1; } + + //! Get the second moment coefficient. + double Beta2() const { return beta2; } + //! Modify the second moment coefficient. + double& Beta2() { return beta2; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. 
+ * + * @param parent YogiUpdate object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(YogiUpdate& parent, const size_t rows, const size_t cols) : + parent(parent) + { + m.zeros(rows, cols); + v.zeros(rows, cols); + } + + /** + * Update step for Yogi. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + m *= parent.beta1; + m += (1 - parent.beta1) * gradient; + + const MatType gSquared = arma::square(gradient); + v -= (1 - parent.beta2) * arma::sign(v - gSquared) % gSquared; + + // Now update the iterate. + iterate -= stepSize * m / (arma::sqrt(v) + parent.epsilon); + } + + private: + //! Instantiated parent object. + YogiUpdate& parent; + + //! The exponential moving average of gradient values. + GradType m; + + // The exponential moving average of squared gradient values. + GradType v; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; + + // The smoothing parameter. + double beta1; + + // The second moment coefficient. + double beta2; +}; + +} // namespace ens + +#endif diff --git a/tools/HISTORYold.md b/tools/HISTORYold.md index 3d8296b..a5978c4 100644 --- a/tools/HISTORYold.md +++ b/tools/HISTORYold.md @@ -1,3 +1,26 @@ +### ensmallen ?.??.?: "???" +###### ????-??-?? + +### ensmallen 2.19.0: "Eight Ball Deluxe" +###### 2022-04-06 +* Added DemonSGD and DemonAdam optimizers + ([#211](https://github.com/mlpack/ensmallen/pull/211)). + + * Fix bug with Adam-like optimizers not resetting when `resetPolicy` is `true`. + ([#340](https://github.com/mlpack/ensmallen/pull/340)). + + * Add Yogi optimizer + ([#232](https://github.com/mlpack/ensmallen/pull/232)). + + * Add AdaBelief optimizer + ([#233](https://github.com/mlpack/ensmallen/pull/233)). + + * Add AdaSqrt optimizer + ([#234](https://github.com/mlpack/ensmallen/pull/234)). + + * Bump check for minimum supported version of Armadillo + ([#342](https://github.com/mlpack/ensmallen/pull/342)). + ### ensmallen 2.18.2: "Fairmount Bagel" ###### 2022-02-13 * Update Catch2 to 2.13.8
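The DemonAdamUpdate and DemonSGDUpdate policies added in this patch share one idea: the effective momentum is recomputed every step from the initial coefficient betaInit and the fraction of the momentumIterations budget already consumed. The standalone sketch below is not part of the patch; the budget T and the sampling stride are illustrative values chosen only to print the schedule implemented by the Update() methods above.

    #include <cstdio>
    #include <cstddef>

    // Sketch of the Demon momentum schedule:
    //   betaDecay = betaInit * (1 - t / T)
    //   beta_t    = betaDecay / ((1 - betaInit) + betaDecay)
    int main()
    {
      const double betaInit = 0.9;    // initial momentum coefficient
      const std::size_t T = 10000;    // momentumIterations budget

      for (std::size_t t = 0; t <= T; t += 2500)
      {
        const double decayRate = (t > 0) ? 1.0 - (double) t / (double) T : 1.0;
        const double betaDecay = betaInit * decayRate;
        const double beta = betaDecay / ((1.0 - betaInit) + betaDecay);
        std::printf("t = %5zu  beta = %.4f\n", t, beta);
      }
      return 0;
    }

At t = 0 the printed coefficient equals betaInit (0.9 here) and it reaches 0 at t = T, matching the decayRate/betaDecay arithmetic in the patched update policies.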
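Several of the hunks above (FTMLUpdate, PadamUpdate, QHAdamUpdate, SWATSUpdate) move the iteration counter used for bias correction out of the long-lived update-rule object and into its nested Policy, the per-optimization state that SGD rebuilds when resetPolicy is true; this is the change the NEWS entry describes as fixing Adam-like optimizers that did not reset. The sketch below is a schematic of that pattern with made-up class names, not the library's own classes.

    #include <cmath>
    #include <cstddef>

    // Schematic: the counter used for bias correction lives in the per-run
    // Policy object, not in the long-lived update-rule object.
    class ToyAdamLikeUpdate
    {
     public:
      explicit ToyAdamLikeUpdate(const double beta1 = 0.9) : beta1(beta1) { }

      class Policy
      {
       public:
        explicit Policy(ToyAdamLikeUpdate& parent) : parent(parent), iteration(0) { }

        double BiasCorrection1()
        {
          ++iteration;  // restarts from zero whenever a fresh Policy is built
          return 1.0 - std::pow(parent.beta1, iteration);
        }

       private:
        ToyAdamLikeUpdate& parent;
        std::size_t iteration;  // per-optimization state
      };

     private:
      double beta1;
    };

    int main()
    {
      ToyAdamLikeUpdate rule;
      ToyAdamLikeUpdate::Policy firstRun(rule), secondRun(rule);
      // Both runs start their bias correction from iteration 0, unlike a
      // counter stored in `rule` that would keep growing across runs.
      (void) firstRun.BiasCorrection1();
      (void) secondRun.BiasCorrection1();
      return 0;
    }

Because the counter belongs to the Policy, a second Optimize() call with resetPolicy set to true begins its bias correction from zero instead of continuing from the previous run.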
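The new YogiUpdate policy differs from Adam only in the second-moment recursion: instead of an exponential moving average it applies the additive, sign-controlled step v -= (1 - beta2) * sign(v - g^2) % g^2, which limits how quickly the effective learning rate can change. Below is a minimal Armadillo sketch of a single step that mirrors the Update() body above; the matrix sizes and hyperparameter values are illustrative only.

    #include <armadillo>

    int main()
    {
      const double stepSize = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8;

      arma::mat iterate(3, 1, arma::fill::randu);
      arma::mat gradient(3, 1, arma::fill::randn);
      arma::mat m(3, 1, arma::fill::zeros);  // first moment estimate
      arma::mat v(3, 1, arma::fill::zeros);  // second moment estimate

      // One Yogi step, following YogiUpdate::Policy::Update().
      m *= beta1;
      m += (1 - beta1) * gradient;

      const arma::mat gSquared = arma::square(gradient);
      v -= (1 - beta2) * arma::sign(v - gSquared) % gSquared;

      iterate -= stepSize * m / (arma::sqrt(v) + epsilon);

      iterate.print("iterate after one Yogi step:");
      return 0;
    }

When v is smaller than g^2 the sign term adds (1 - beta2) * g^2, exactly as Adam would; when v overshoots, the same magnitude is subtracted, which is the "fine grained effective learning rate control" the class documentation refers to.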
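Since RcppEnsmallen only ships these headers, the optimizers added in 2.19.0 are driven through the same interface as the existing ones. The sketch below is illustrative rather than taken from the package: ToySeparableFunction is a made-up objective written against the separable function signature (Evaluate/Gradient with begin and batchSize, plus NumFunctions and Shuffle) described in the ensmallen function-type documentation, and it is minimized with the Yogi wrapper defined above.

    #include <ensmallen.hpp>
    #include <cmath>

    // Toy separable objective: f(x) = sum_i 0.5 * (x - a_i)^2, minimized at the
    // mean of the targets a_i.
    class ToySeparableFunction
    {
     public:
      explicit ToySeparableFunction(arma::vec targets) : targets(std::move(targets)) { }

      size_t NumFunctions() const { return targets.n_elem; }

      void Shuffle() { targets = arma::shuffle(targets); }

      double Evaluate(const arma::mat& x, const size_t begin, const size_t batchSize)
      {
        double objective = 0.0;
        for (size_t i = begin; i < begin + batchSize; ++i)
          objective += 0.5 * std::pow(x(0, 0) - targets(i), 2.0);
        return objective;
      }

      void Gradient(const arma::mat& x, const size_t begin, arma::mat& gradient,
                    const size_t batchSize)
      {
        gradient.zeros(x.n_rows, x.n_cols);
        for (size_t i = begin; i < begin + batchSize; ++i)
          gradient(0, 0) += x(0, 0) - targets(i);
      }

     private:
      arma::vec targets;
    };

    int main()
    {
      ToySeparableFunction f(arma::vec{1.0, 2.0, 3.0, 4.0});
      arma::mat x(1, 1, arma::fill::zeros);

      ens::Yogi optimizer(0.01 /* stepSize */, 2 /* batchSize */);
      optimizer.Optimize(f, x);

      x.print("optimized x (should move toward 2.5, the mean of the targets):");
      return 0;
    }

The same driver works for DemonSGD, DemonAdam, or any of the other optimizers in this release, since each of them wraps the SGD core with a different update policy.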