Implementation of Quasi-Hyperbolic Momentum for Adam #81

Merged · 23 commits · May 15, 2019

16 changes: 9 additions & 7 deletions doc/function_types.md
@@ -52,7 +52,7 @@ class SquaredFunction

int main()
{
  // The minimum is at x = [0 0 0]. Our initial point is chosen to be
// [1.0, -1.0, 1.0].
arma::mat x("1.0 -1.0 1.0");

@@ -146,7 +146,7 @@ class LinearRegressionFunction
public:
// Construct the object with the given data matrix and responses.
LinearRegressionFunction(const arma::mat& dataIn,
                           const arma::rowvec& responsesIn) :
data(dataIn), responses(responsesIn) { }

// Return the objective function for model parameters x.
@@ -173,13 +173,13 @@ class LinearRegressionEWGFunction
{
public:
// Construct the object with the given data matrix and responses.
  LinearRegressionEWGFunction(const arma::mat& dataIn,
                              const arma::rowvec& responsesIn) :
data(dataIn), responses(responsesIn) { }

// Simultaneously compute both the objective function and gradient for model
// parameters x. Note that this is faster than implementing Evaluate() and
  // Gradient() individually because it caches the computation of
// (responses - x.t() * data)!
double EvaluateWithGradient(const arma::mat& x, arma::mat& g)
{
@@ -483,7 +483,7 @@ class ArbitrarySeparableFunctionType
//
// Given parameters x and a matrix g, return the sum of the individual
// functions f_i(x) + ... + f_{i + batchSize - 1}(x), and store the sum of
// the gradient of individual functions f'_i(x) + ... +
// f'_{i + batchSize - 1}(x) into the provided matrix g. g should have the
// same size (rows, columns) as x. i will always be greater than 0, and i +
// batchSize will be less than or equal to the value of NumFunctions().
@@ -518,6 +518,8 @@ The following optimizers can be used with differentiable functions:
- [NadaMax](#nadamax)
- [NesterovMomentumSGD](#nesterov-momentum-sgd)
- [OptimisticAdam](#optimisticadam)
- [QHAdam](#qhadam)
- [QHSGD](#qhsgd)
- [RMSProp](#rmsprop)
- [SARAH/SARAH+](#stochastic-recursive-gradient-algorithm-sarahsarah)
- [SGD](#standard-sgd)
@@ -848,7 +850,7 @@ Example code showing how to solve an SDP is given below.
```c++
int main()
{
  // We will build a toy semidefinite program and then use the PrimalDualSolver to find a solution

// The semi-definite constraint looks like:
//
117 changes: 114 additions & 3 deletions doc/optimizers.md
@@ -65,7 +65,7 @@ parameters.
- `AdaGrad(`_`stepSize, batchSize`_`)`
- `AdaGrad(`_`stepSize, batchSize, epsilon, maxIterations, tolerance, shuffle`_`)`
- `AdaGrad(`_`stepSize, batchSize, epsilon, maxIterations, tolerance, shuffle, resetPolicy`_`)`

#### Attributes

| **type** | **name** | **description** | **default** |
@@ -932,7 +932,7 @@ proximalOptimizer.Optimize(f, coordinates);
*An optimizer for [differentiable functions](#differentiable-functions)*

L-BFGS is an optimization algorithm in the family of quasi-Newton methods that approximates the Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm using a limited amount of computer memory.

#### Constructors

* `L_BFGS()`
@@ -1394,6 +1394,117 @@ double Optimize(arma::mat& X);
* [Semidefinite programming on Wikipedia](https://en.wikipedia.org/wiki/Semidefinite_programming)
* [Semidefinite programs](#semidefinite-programs) (includes example usage of `PrimalDualSolver`)

## Quasi-Hyperbolic Momentum Update SGD (QHSGD)

*An optimizer for [differentiable separable functions](#differentiable-separable-functions).*

Quasi-hyperbolic momentum update SGD (QHSGD) is an SGD-like optimizer with momentum where quasi-hyperbolic terms are added to the parametrization.
Its update rule is a weighted average of the momentum SGD step and the vanilla SGD step, as sketched below.
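
For intuition, here is a minimal sketch of one quasi-hyperbolic momentum (QHM) step as described in the paper linked below; the helper name and variables are hypothetical and this is not ensmallen's internal `QHUpdate` code.

```c++
// One QHM step: the parameter update mixes the momentum (exponentially
// averaged) gradient with the current raw gradient, weighted by v.
// Hypothetical helper for illustration; not ensmallen's QHUpdate code.
void QHMStep(arma::mat& iterate,
             arma::mat& velocity,        // running average of past gradients
             const arma::mat& gradient,  // gradient at the current iterate
             const double stepSize,      // step size
             const double momentum,      // momentum decay term
             const double v)             // quasi-hyperbolic term
{
  velocity = momentum * velocity + (1 - momentum) * gradient;
  iterate -= stepSize * ((1 - v) * gradient + v * velocity);
}
```

With `v = 0` this reduces to vanilla SGD, and with `v = 1` to momentum SGD in its exponentially weighted form.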


#### Constructors

* `QHSGD()`
* `QHSGD(`_`stepSize, batchSize`_`)`
* `QHSGD(`_`stepSize, batchSize, maxIterations, tolerance, shuffle`_`)`

Note that `QHSGD` is based on the templated type
`SGD<`_`UpdatePolicyType, DecayPolicyType`_`>` with _`UpdatePolicyType`_` =
QHUpdate` and _`DecayPolicyType`_` = NoDecay`.

#### Attributes

| **type** | **name** | **description** | **default** |
|----------|----------|-----------------|-------------|
| `double` | **`stepSize`** | Step size for each iteration. | `0.01` |
| `size_t` | **`batchSize`** | Batch size to use for each step. | `32` |
| `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` |
| `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` |
| `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` |

Attributes of the optimizer may also be modified via the member methods
`StepSize()`, `BatchSize()`, `MaxIterations()`, `Tolerance()`, and `Shuffle()`.

Note that the `QHUpdate` class has the constructor `QHUpdate(`_`v, momentum`_`)` with a
default value of `0.7` for the quasi-hyperbolic term and `0.999` for the momentum term.
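
To use non-default values, one option is to construct the underlying `SGD<QHUpdate>` directly and pass in a custom `QHUpdate` instance; the sketch below assumes the usual `SGD` constructor that accepts an update-policy object after the `shuffle` flag.

```c++
// Sketch: QHSGD wraps SGD<QHUpdate>, so a QHUpdate with a custom
// quasi-hyperbolic term (0.9) and momentum (0.99) can be passed to SGD.
RosenbrockFunction f;
arma::mat coordinates = f.GetInitialPoint();

SGD<QHUpdate> customQHSGD(0.01, 32, 100000, 1e-5, true, QHUpdate(0.9, 0.99));
customQHSGD.Optimize(f, coordinates);
```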

#### Examples

```c++
RosenbrockFunction f;
arma::mat coordinates = f.GetInitialPoint();

QHSGD optimizer(0.01, 32, 100000, 1e-5, true);
optimizer.Optimize(f, coordinates);
```

#### See also:

* [Quasi-Hyperbolic Momentum and Adam For Deep Learning](https://arxiv.org/pdf/1810.06801.pdf)
* [Momentum SGD](#momentum-sgd)
* [Nesterov Momentum SGD](#nesterov-momentum-sgd)
* [SGD in Wikipedia](https://en.wikipedia.org/wiki/Stochastic_gradient_descent)
* [Differentiable separable functions](#differentiable-separable-functions)

## QHAdam

*An optimizer for [differentiable separable functions](#differentiable-separable-functions).*

QHAdam is an optimizer that applies quasi-hyperbolic descent to the Adam optimizer. It replaces the moment estimators of Adam with quasi-hyperbolic terms, and particular values of the `v1` and `v2` parameters make it equivalent to other optimizers (see the sketch after this list):

* When `v1 = v2 = 1`, `QHAdam` is equivalent to `Adam`.

* When `v1 = 0` and `v2 = 1`, `QHAdam` is equivalent to `RMSProp`.

* When `v1 = beta1` and `v2 = 1`, `QHAdam` is equivalent to `Nadam`.
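
As a configuration sketch (using the constructor argument order `stepSize, batchSize, v1, v2, beta1, beta2` documented below, with the remaining parameters left at their defaults), these special cases look like:

```c++
// Illustrative only: choosing v1 and v2 to recover other optimizers
// (constructor order: stepSize, batchSize, v1, v2, beta1, beta2, ...).
QHAdam adamLike(0.001, 32, 1.0, 1.0, 0.9, 0.999);     // v1 = v2 = 1: Adam
QHAdam rmsPropLike(0.001, 32, 0.0, 1.0, 0.9, 0.999);  // v1 = 0, v2 = 1: RMSProp
QHAdam nadamLike(0.001, 32, 0.9, 1.0, 0.9, 0.999);    // v1 = beta1, v2 = 1: Nadam
```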

#### Constructors

* `QHAdam()`
* `QHAdam(`_`stepSize, batchSize`_`)`
* `QHAdam(`_`stepSize, batchSize, v1, v2, beta1, beta2, eps, maxIterations`_`)`
* `QHAdam(`_`stepSize, batchSize, v1, v2, beta1, beta2, eps, maxIterations, tolerance, shuffle, resetPolicy`_`)`

#### Attributes

| **type** | **name** | **description** | **default** |
|----------|----------|-----------------|-------------|
| `double` | **`stepSize`** | Step size for each iteration. | `0.001` |
| `size_t` | **`batchSize`** | Number of points to process in a single step. | `32` |
| `double` | **`v1`** | The first quasi-hyperbolic term. | `0.7` |
| `double` | **`v2`** | The second quasi-hyperbolic term. | `1.0` |
| `double` | **`beta1`** | Exponential decay rate for the first moment estimates. | `0.9` |
| `double` | **`beta2`** | Exponential decay rate for the second moment estimates. | `0.999` |
| `double` | **`eps`** | Value used to initialize the mean squared gradient parameter. | `1e-8` |
| `size_t` | **`maxIterations`** | Maximum number of iterations allowed (0 means no limit). | `100000` |
| `double` | **`tolerance`** | Maximum absolute tolerance to terminate algorithm. | `1e-5` |
| `bool` | **`shuffle`** | If true, the function order is shuffled; otherwise, each function is visited in linear order. | `true` |
| `bool` | **`resetPolicy`** | If true, parameters are reset before every Optimize call; otherwise, their values are retained. | `true` |

The attributes of the optimizer may also be modified via the member methods
`StepSize()`, `BatchSize()`, `Beta1()`, `Beta2()`, `Epsilon()`, `MaxIterations()`,
`Tolerance()`, `Shuffle()`, `V1()`, `V2()`, and `ResetPolicy()`.
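
For instance, hyperparameters can be tuned through the reference-returning accessors (names taken from the `qhadam.hpp` header in this pull request):

```c++
QHAdam optimizer;              // start from the default hyperparameters
optimizer.V1() = 0.8;          // adjust the first quasi-hyperbolic term
optimizer.V2() = 1.0;          // second quasi-hyperbolic term (Adam-like value)
optimizer.StepSize() = 0.002;  // tune the step size
```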

#### Examples

```c++
RosenbrockFunction f;
arma::mat coordinates = f.GetInitialPoint();

QHAdam optimizer(0.001, 32, 0.7, 0.9, 0.9, 0.999, 1e-8, 100000, 1e-5, true);
optimizer.Optimize(f, coordinates);
```

#### See also:

* [Quasi-Hyperbolic Momentum and Adam For Deep Learning](https://arxiv.org/pdf/1810.06801.pdf)
* [SGD in Wikipedia](https://en.wikipedia.org/wiki/Stochastic_gradient_descent)
* [SGD](#standard-sgd)
* [Adam: A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980)
* [Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
* [Incorporating Nesterov Momentum into Adam](http://cs229.stanford.edu/proj2015/054_report.pdf)
* [Differentiable separable functions](#differentiable-separable-functions)

**Review comment (Member):** Might be worth also linking to Adam, Nadam, and RMSprop? (And vice versa from those?)

## RMSProp

*An optimizer for [differentiable separable functions](#differentiable-separable-functions).*
@@ -1737,7 +1848,7 @@ cyclicscd.Optimize(f, coordinates);

*An optimizer for [differentiable separable functions](#differentiable-separable-functions).*

SGDR is based on the mini-batch Stochastic Gradient Descent class and simulates a new warm-started run/restart once a certain number of epochs have been performed.

#### Constructors

1 change: 1 addition & 0 deletions include/ensmallen.hpp
@@ -64,6 +64,7 @@
#include "ensmallen_bits/ada_delta/ada_delta.hpp"
#include "ensmallen_bits/ada_grad/ada_grad.hpp"
#include "ensmallen_bits/adam/adam.hpp"
#include "ensmallen_bits/qhadam/qhadam.hpp"
#include "ensmallen_bits/aug_lagrangian/aug_lagrangian.hpp"
#include "ensmallen_bits/bigbatch_sgd/bigbatch_sgd.hpp"
#include "ensmallen_bits/cmaes/cmaes.hpp"
2 changes: 1 addition & 1 deletion include/ensmallen_bits/adam/adam.hpp
@@ -11,7 +11,7 @@
* for first-order gradient-based optimization of stochastic objective
* functions, based on adaptive estimates of lower-order moments. AdaMax is
* simply a variant of Adam based on the infinity norm. AMSGrad is another
* variant of Adam with guaranteed convergence. Nadam is another variant of
* Adam based on NAG. NadaMax is a variant for Nadam based on Infinity form.
*
* ensmallen is free software; you may redistribute it and/or modify it under
164 changes: 164 additions & 0 deletions include/ensmallen_bits/qhadam/qhadam.hpp
@@ -0,0 +1,164 @@
/**
* @file qhadam.hpp
* @author Niteya Shah
*

**Review comment (Member):** Can you add the license and a description?

* Wrapper class for the QHAdam optimizer (SGD with the QHAdam update
* policy). QHAdam is a variant of Adam based on quasi-hyperbolic moments.
*
* ensmallen is free software; you may redistribute it and/or modify it under
* the terms of the 3-clause BSD license. You should have received a copy of
* the 3-clause BSD license along with ensmallen. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/
#ifndef ENSMALLEN_ADAM_QHADAM_HPP
#define ENSMALLEN_ADAM_QHADAM_HPP

#include <ensmallen_bits/sgd/sgd.hpp>
#include "qhadam_update.hpp"

namespace ens {

/**
* QHAdam is a variant of Adam with a quasi-hyperbolic update step: the
* update is a weighted average of the momentum step and the plain gradient
* step. Due to its parameterisation it can recover many other optimisation
* strategies.
*
* For more information, see the following.
*
* @code
* @inproceedings{ma2019qh,
* title={Quasi-hyperbolic momentum and Adam for deep learning},
* author={Jerry Ma and Denis Yarats},
* booktitle={International Conference on Learning Representations},
* year={2019}
* }
* @endcode
*
* QHAdam can optimize differentiable separable functions. For more details,
* see the documentation on function types included with this distribution or
* on the ensmallen website.
*/
class QHAdam
{
public:
/**
* Construct the QHAdam optimizer with the given function and parameters.
* QHAdam is sensitive to its parameters, so good hyperparameter
* selection is necessary; the defaults may not fit every case.
*
* The maximum number of iterations refers to the maximum number of
* points that are processed (i.e., one iteration equals one point; one
* iteration does not equal one pass over the dataset).
*
* @param stepSize Step size for each iteration.
* @param batchSize Number of points to process in a single step.
* @param v1 The first quasi-hyperbolic term.
* @param v2 The second quasi-hyperbolic term.
* @param beta1 Exponential decay rate for the first moment estimates.
* @param beta2 Exponential decay rate for the second moment (squared
*     gradient) estimates.
* @param epsilon Value used to initialise the mean squared gradient parameter.
* @param maxIterations Maximum number of iterations allowed (0 means no
* limit).
* @param tolerance Maximum absolute tolerance to terminate algorithm.
* @param shuffle If true, the function order is shuffled; otherwise, each
* function is visited in linear order.
* @param resetPolicy If true, parameters are reset before every Optimize
* call; otherwise, their values are retained.
*/
QHAdam(const double stepSize = 0.001,
const size_t batchSize = 32,
const double v1 = 0.7,
const double v2 = 1,
const double beta1 = 0.9,
const double beta2 = 0.999,
const double epsilon = 1e-8,
const size_t maxIterations = 100000,
const double tolerance = 1e-5,
const bool shuffle = true,
const bool resetPolicy = true);

/**
* Optimize the given function using QHAdam. The given starting point will be
* modified to store the finishing point of the algorithm, and the final
* objective value is returned.
*
* @tparam DecomposableFunctionType Type of the function to optimize.
* @param function Function to optimize.
* @param iterate Starting point (will be modified).
* @return Objective value of the final point.
*/
template<typename DecomposableFunctionType>
double Optimize(DecomposableFunctionType& function, arma::mat& iterate)
{
return optimizer.Optimize(function, iterate);
}

//! Get the step size.
double StepSize() const { return optimizer.StepSize(); }
//! Modify the step size.
double& StepSize() { return optimizer.StepSize(); }

//! Get the batch size.
size_t BatchSize() const { return optimizer.BatchSize(); }
//! Modify the batch size.
size_t& BatchSize() { return optimizer.BatchSize(); }

//! Get the smoothing parameter.
double Beta1() const { return optimizer.UpdatePolicy().Beta1(); }
//! Modify the smoothing parameter.
double& Beta1() { return optimizer.UpdatePolicy().Beta1(); }

//! Get the second moment coefficient.
double Beta2() const { return optimizer.UpdatePolicy().Beta2(); }
//! Modify the second moment coefficient.
double& Beta2() { return optimizer.UpdatePolicy().Beta2(); }

//! Get the value used to initialise the mean squared gradient parameter.
double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); }
//! Modify the value used to initialise the mean squared gradient parameter.
double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); }

//! Get the maximum number of iterations (0 indicates no limit).
size_t MaxIterations() const { return optimizer.MaxIterations(); }
//! Modify the maximum number of iterations (0 indicates no limit).
size_t& MaxIterations() { return optimizer.MaxIterations(); }

//! Get the tolerance for termination.
double Tolerance() const { return optimizer.Tolerance(); }
//! Modify the tolerance for termination.
double& Tolerance() { return optimizer.Tolerance(); }

//! Get whether or not the individual functions are shuffled.
bool Shuffle() const { return optimizer.Shuffle(); }
//! Modify whether or not the individual functions are shuffled.
bool& Shuffle() { return optimizer.Shuffle(); }

//! Get whether or not the update policy parameters
//! are reset before Optimize call.
bool ResetPolicy() const { return optimizer.ResetPolicy(); }
//! Modify whether or not the update policy parameters
//! are reset before Optimize call.
bool& ResetPolicy() { return optimizer.ResetPolicy(); }

//! Get the first quasi hyperbolic parameter.
double V1() const { return optimizer.UpdatePolicy().V1(); }
//! Modify the first quasi hyperbolic parameter.
double& V1() { return optimizer.UpdatePolicy().V1(); }

//! Get the second quasi hyperbolic parameter.
double V2() const { return optimizer.UpdatePolicy().V2(); }
//! Modify the second quasi hyperbolic parameter.
double& V2() { return optimizer.UpdatePolicy().V2(); }

private:
//! The Stochastic Gradient Descent object with QHAdam policy.
SGD<QHAdamUpdate> optimizer;
};

} // namespace ens

// Include implementation.
#include "qhadam_impl.hpp"

#endif