From 059893eaa95bf2857f27aa294bf59106cb1bd76e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 11 Apr 2022 11:31:18 -0500 Subject: [PATCH] Upgrade ensmallen to 2.19.0 (#49) Co-authored-by: coatless --- ChangeLog | 7 + DESCRIPTION | 2 +- NEWS.md | 17 ++ inst/include/ensmallen.hpp | 9 +- .../ensmallen_bits/ada_belief/ada_belief.hpp | 186 ++++++++++++++++ .../ada_belief/ada_belief_impl.hpp | 44 ++++ .../ada_belief/ada_belief_update.hpp | 153 +++++++++++++ .../ada_bound/ada_bound_update.hpp | 30 +-- .../ada_bound/ams_bound_update.hpp | 30 +-- .../ensmallen_bits/ada_sqrt/ada_sqrt.hpp | 168 ++++++++++++++ .../ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp | 38 ++++ .../ada_sqrt/ada_sqrt_update.hpp | 118 ++++++++++ .../ensmallen_bits/adam/adam_update.hpp | 25 +-- .../ensmallen_bits/adam/adamax_update.hpp | 21 +- .../ensmallen_bits/adam/amsgrad_update.hpp | 25 +-- .../ensmallen_bits/adam/nadam_update.hpp | 28 +-- .../ensmallen_bits/adam/nadamax_update.hpp | 23 +- .../adam/optimisticadam_update.hpp | 28 +-- .../ensmallen_bits/demon_adam/demon_adam.hpp | 207 ++++++++++++++++++ .../demon_adam/demon_adam_update.hpp | 169 ++++++++++++++ .../ensmallen_bits/demon_sgd/demon_sgd.hpp | 178 +++++++++++++++ .../demon_sgd/demon_sgd_update.hpp | 139 ++++++++++++ inst/include/ensmallen_bits/ens_version.hpp | 10 +- .../ensmallen_bits/ftml/ftml_update.hpp | 22 +- .../fw/proximal/proximal_impl.hpp | 2 +- .../ensmallen_bits/lbfgs/lbfgs_impl.hpp | 4 +- .../ensmallen_bits/padam/padam_update.hpp | 25 +-- .../ensmallen_bits/qhadam/qhadam_update.hpp | 25 +-- inst/include/ensmallen_bits/sa/sa_impl.hpp | 2 +- .../spalera_sgd/spalera_sgd_impl.hpp | 1 - .../ensmallen_bits/swats/swats_update.hpp | 26 +-- inst/include/ensmallen_bits/yogi/yogi.hpp | 189 ++++++++++++++++ .../include/ensmallen_bits/yogi/yogi_impl.hpp | 44 ++++ .../ensmallen_bits/yogi/yogi_update.hpp | 146 ++++++++++++ tools/HISTORYold.md | 23 ++ 35 files changed, 1951 insertions(+), 213 deletions(-) create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief.hpp create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp create mode 100644 inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp create mode 100644 inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp create mode 100644 inst/include/ensmallen_bits/demon_adam/demon_adam.hpp create mode 100644 inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp create mode 100644 inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp create mode 100644 inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi_impl.hpp create mode 100644 inst/include/ensmallen_bits/yogi/yogi_update.hpp diff --git a/ChangeLog b/ChangeLog index 4033cfd..5b2142b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2022-04-11 James Balamuta + + * DESCRIPTION (Version): Release 2.19.0 + * NEWS.md: Update for Ensmallen release 2.19.0 + * inst/include/ensmallen_bits: Upgraded to Ensmallen 2.19.0 + * inst/include/ensmallen.hpp: ditto + 2022-02-18 James Balamuta * DESCRIPTION: Update URLs diff --git a/DESCRIPTION b/DESCRIPTION index ced4419..a23034a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: RcppEnsmallen Title: Header-Only C++ Mathematical Optimization 
Library for 'Armadillo' -Version: 0.2.18.2.1 +Version: 0.2.19.0.1 Authors@R: c( person("James Joseph", "Balamuta", email = "balamut2@illinois.edu", role = c("aut", "cre", "cph"), diff --git a/NEWS.md b/NEWS.md index 3637526..f2e8938 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# RcppEnsmallen 0.2.19.0.1 + +- Upgraded to ensmallen 2.19.0: "Eight Ball Deluxe" (2022-04-11) + - Added DemonSGD and DemonAdam optimizers + ([#211](https://github.com/mlpack/ensmallen/pull/211)). + - Fix bug with Adam-like optimizers not resetting when `resetPolicy` is `true`. + ([#340](https://github.com/mlpack/ensmallen/pull/340)). + - Add Yogi optimizer + ([#232](https://github.com/mlpack/ensmallen/pull/232)). + - Add AdaBelief optimizer + ([#233](https://github.com/mlpack/ensmallen/pull/233)). + - Add AdaSqrt optimizer + ([#234](https://github.com/mlpack/ensmallen/pull/234)). + + - Bump check for minimum supported version of Armadillo + ([#342](https://github.com/mlpack/ensmallen/pull/342)). + # RcppEnsmallen 0.2.18.2.1 - Upgraded to ensmallen 2.18.2: "Fairmount Bagel" (2022-02-14) diff --git a/inst/include/ensmallen.hpp b/inst/include/ensmallen.hpp index 010c5cb..a1338e8 100644 --- a/inst/include/ensmallen.hpp +++ b/inst/include/ensmallen.hpp @@ -29,8 +29,8 @@ #error "please enable C++11/C++14 mode in your compiler" #endif -#if ((ARMA_VERSION_MAJOR < 8) || ((ARMA_VERSION_MAJOR == 8) && (ARMA_VERSION_MINOR < 400))) - #error "need Armadillo version 8.400 or later" +#if ((ARMA_VERSION_MAJOR < 9) || ((ARMA_VERSION_MAJOR == 9) && (ARMA_VERSION_MINOR < 800))) + #error "need Armadillo version 9.800 or later" #endif #include @@ -85,10 +85,14 @@ #include "ensmallen_bits/problems/problems.hpp" // TODO: should move to another place +#include "ensmallen_bits/ada_belief/ada_belief.hpp" #include "ensmallen_bits/ada_bound/ada_bound.hpp" #include "ensmallen_bits/ada_delta/ada_delta.hpp" #include "ensmallen_bits/ada_grad/ada_grad.hpp" +#include "ensmallen_bits/ada_sqrt/ada_sqrt.hpp" #include "ensmallen_bits/adam/adam.hpp" +#include "ensmallen_bits/demon_adam/demon_adam.hpp" +#include "ensmallen_bits/demon_sgd/demon_sgd.hpp" #include "ensmallen_bits/qhadam/qhadam.hpp" #include "ensmallen_bits/aug_lagrangian/aug_lagrangian.hpp" #include "ensmallen_bits/bigbatch_sgd/bigbatch_sgd.hpp" @@ -131,5 +135,6 @@ #include "ensmallen_bits/svrg/svrg.hpp" #include "ensmallen_bits/swats/swats.hpp" #include "ensmallen_bits/wn_grad/wn_grad.hpp" +#include "ensmallen_bits/yogi/yogi.hpp" #endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp new file mode 100644 index 0000000..1a4b13c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief.hpp @@ -0,0 +1,186 @@ +/** + * @file ada_belief.hpp + * @author Marcus Edel + * + * Class wrapper for the AdaBelief update Policy. The intuition for AdaBelief is + * to adapt the stepsize according to the "belief" in the current gradient + * direction. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_HPP +#define ENSMALLEN_ADA_BELIEF_HPP + +#include +#include "ada_belief_update.hpp" + +namespace ens { + +/** + * The intuition for AdaBelief is to adapt the stepsize according to the + * "belief" in the current gradient direction. 
For more information, see the + * following. + * + * @code + * @misc{zhuang2020adabelief, + * title = {AdaBelief Optimizer: Adapting Stepsizes by the Belief in + * Observed Gradients}, + * author = {Juntang Zhuang and Tommy Tang and Sekhar Tatikonda and + * Nicha Dvornek and Yifan Ding and Xenophon Papademetris + * and James S. Duncan}, + * year = {2020}, + * eprint = {2010.07468}, + * archivePrefix = {arXiv}, + * } + * @endcode + * + * AdaBelief can optimize differentiable separable functions. For more details, + * see the documentation on function types included with this distribution or + * on the ensmallen website. + */ +class AdaBelief +{ + public: + /** + * Construct the AdaBelief optimizer with the given function and parameters. + * AdaBelief is sensitive to its parameters and hence a good hyperparameter + * selection is necessary as its default may not fit every case. + * + * The maximum number of iterations refers to the maximum number of + * points that are processed (i.e., one iteration equals one point; one + * iteration does not equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param beta1 The exponential decay rate for the 1st moment estimates. + * @param beta2 The exponential decay rate for the 2nd moment estimates. + * @param epsilon A small constant for numerical stability. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + AdaBelief(const double stepSize = 0.001, + const size_t batchSize = 32, + const double beta1 = 0.9, + const double beta2 = 0.999, + const double epsilon = 1e-12, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false); + + /** + * Optimize the given function using AdaBelief. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename std::enable_if::value, + typename MatType::elem_type>::type + Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! 
Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the exponential decay rate for the 1st moment estimates. + double Beta1() const { return optimizer.UpdatePolicy().Beta1(); } + //! Modify the exponential decay rate for the 1st moment estimates. + double& Beta1() { return optimizer.UpdatePolicy().Beta1(); } + + //! Get the exponential decay rate for the 2nd moment estimates. + double Beta2() const { return optimizer.UpdatePolicy().Beta2(); } + //! Get the second moment coefficient. + double& Beta2() { return optimizer.UpdatePolicy().Beta2(); } + + //! Get the value for numerical stability. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used for numerical stability. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters are reset before + //! Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with AdaBelief policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. +#include "ada_belief_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp new file mode 100644 index 0000000..485efe5 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief_impl.hpp @@ -0,0 +1,44 @@ +/** + * @file ada_belief_impl.hpp + * @author Marcus Edel + * + * Implementation of AdaBelief class wrapper. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_ADA_BELIEF_IMPL_HPP +#define ENSMALLEN_ADA_BELIEF_ADA_BELIEF_IMPL_HPP + +// In case it hasn't been included yet. 
+#include "ada_belief.hpp" + +namespace ens { + +inline AdaBelief::AdaBelief( + const double stepSize, + const size_t batchSize, + const double beta1, + const double beta2, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + AdaBeliefUpdate(epsilon, beta1, beta2), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + + #endif diff --git a/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp b/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp new file mode 100644 index 0000000..f768987 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_belief/ada_belief_update.hpp @@ -0,0 +1,153 @@ +/** + * @file ada_belief_update.hpp + * @author Marcus Edel + * + * AdaBelief optimizer update policy. The intuition for AdaBelief is to adapt + * the stepsize according to the "belief" in the current gradient direction. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_BELIEF_ADA_BELIEF_UPDATE_HPP +#define ENSMALLEN_ADA_BELIEF_ADA_BELIEF_UPDATE_HPP + +namespace ens { + +/** + * The intuition for AdaBelief is to adapt the stepsize according to the + * "belief" in the current gradient direction. + * + * For more information, see the following. + * + * @code + * @misc{zhuang2020adabelief, + * title = {AdaBelief Optimizer: Adapting Stepsizes by the Belief in + * Observed Gradients}, + * author = {Juntang Zhuang and Tommy Tang and Sekhar Tatikonda and + * Nicha Dvornek and Yifan Ding and Xenophon Papademetris + * and James S. Duncan}, + * year = {2020}, + * eprint = {2010.07468}, + * archivePrefix = {arXiv}, + * } + * @endcode + */ +class AdaBeliefUpdate +{ + public: + /** + * Construct the AdaBelief update policy with the given parameters. + * + * @param epsilon A small constant for numerical stability. + * @param beta1 The exponential decay rate for the 1st moment estimates. + * @param beta2 The exponential decay rate for the 2nd moment estimates. + */ + AdaBeliefUpdate(const double epsilon = 1e-8, + const double beta1 = 0.9, + const double beta2 = 0.999) : + epsilon(epsilon), + beta1(beta1), + beta2(beta2) + { + // Nothing to do. + } + + //! Get the value for numerical stability. + double Epsilon() const { return epsilon; } + //! Modify the value used for numerical stability. + double& Epsilon() { return epsilon; } + + //! Get the exponential decay rate for the 1st moment estimates. + double Beta1() const { return beta1; } + //! Modify the exponential decay rate for the 1st moment estimates. + double& Beta1() { return beta1; } + + //! Get the exponential decay rate for the 2nd moment estimates. + double Beta2() const { return beta2; } + //! Modify the exponential decay rate for the 2nd moment estimates. + double& Beta2() { return beta2; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. 
+ */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. + * + * @param parent AdaBeliefUpdate object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(AdaBeliefUpdate& parent, const size_t rows, const size_t cols) : + parent(parent), + iteration(0) + { + m.zeros(rows, cols); + s.zeros(rows, cols); + } + + /** + * Update step for AdaBelief. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + // Increment the iteration counter variable. + ++iteration; + + m *= parent.beta1; + m += (1 - parent.beta1) * gradient; + + s *= parent.beta2; + s += (1 - parent.beta2) * arma::pow(gradient - m, 2.0) + parent.epsilon; + + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); + + // And update the iterate. + iterate -= ((m / biasCorrection1) * stepSize) / (arma::sqrt(s / + biasCorrection2) + parent.epsilon); + } + + private: + //! Instantiated parent object. + AdaBeliefUpdate& parent; + + //! The exponential moving average of gradient values. + GradType m; + + // The exponential moving average of squared gradient values. + GradType s; + + // The number of iterations. + size_t iteration; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; + + // The xponential decay rate for the 1st moment estimates. + double beta1; + + // The exponential decay rate for the 2nd moment estimates. + double beta2; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp b/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp index 4aeeaab..3a84d87 100644 --- a/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp +++ b/inst/include/ensmallen_bits/ada_bound/ada_bound_update.hpp @@ -56,8 +56,7 @@ class AdaBoundUpdate gamma(gamma), epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -87,11 +86,6 @@ class AdaBoundUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -111,7 +105,7 @@ class AdaBoundUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdaBoundUpdate& parent, const size_t rows, const size_t cols) : - parent(parent), first(true), initialStepSize(0) + parent(parent), first(true), initialStepSize(0), iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -139,7 +133,7 @@ class AdaBoundUpdate } // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // Decay the first and second moment running average coefficient. 
m *= parent.beta1; @@ -148,16 +142,12 @@ class AdaBoundUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const ElemType fl = parent.finalLr * stepSize / initialStepSize; - const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * - parent.iteration + 1)); - const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * - parent.iteration)); + const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * iteration + 1)); + const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * iteration)); // Applies bounds on actual learning rate. iterate -= arma::clamp((stepSize * @@ -180,6 +170,9 @@ class AdaBoundUpdate // The initial (Adam) learning rate. double initialStepSize; + + // The number of iterations. + size_t iteration; }; private: @@ -197,9 +190,6 @@ class AdaBoundUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp b/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp index b2d1b98..270f8eb 100644 --- a/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp +++ b/inst/include/ensmallen_bits/ada_bound/ams_bound_update.hpp @@ -56,8 +56,7 @@ class AMSBoundUpdate gamma(gamma), epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -87,11 +86,6 @@ class AMSBoundUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -111,7 +105,7 @@ class AMSBoundUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AMSBoundUpdate& parent, const size_t rows, const size_t cols) : - parent(parent), first(true), initialStepSize(0) + parent(parent), first(true), initialStepSize(0), iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -140,7 +134,7 @@ class AMSBoundUpdate } // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // Decay the first and second moment running average coefficient. m *= parent.beta1; @@ -149,16 +143,12 @@ class AMSBoundUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const ElemType biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const ElemType biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const ElemType fl = parent.finalLr * stepSize / initialStepSize; - const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * - parent.iteration + 1)); - const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * - parent.iteration)); + const ElemType lower = fl * (1.0 - 1.0 / (parent.gamma * iteration + 1)); + const ElemType upper = fl * (1.0 + 1.0 / (parent.gamma * iteration)); // Element wise maximum of past and present squared gradients. 
vImproved = arma::max(vImproved, v); @@ -187,6 +177,9 @@ class AMSBoundUpdate // The optimal squared gradient value. GradType vImproved; + + // The number of iterations. + size_t iteration; }; private: @@ -204,9 +197,6 @@ class AMSBoundUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp new file mode 100644 index 0000000..7f1788c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt.hpp @@ -0,0 +1,168 @@ +/** + * @file ada_sqrt.hpp + * @author Marcus Edel + * + * Implementation of the AdaSqrt optimizer. AdaSqrt is an optimizer that + * chooses learning rate dynamically by adapting to the data and iteration. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_HPP + +#include "../sgd/sgd.hpp" +#include "ada_sqrt_update.hpp" + +namespace ens { + +/** + * AdaSqrt is a modified version of stochastic gradient descent which performs + * larger updates for more sparse parameters and smaller updates for less sparse + * parameters. + * + * For more information, see the following. + * + * @code + * @misc{hu2019secondorder, + * title = {Second-order Information in First-order Optimization Methods}, + * author = {Yuzheng Hu and Licong Lin and Shange Tang}, + * year = {2019}, + * eprint = {1912.09926}, + * } + * @endcode + * + * AdaSqrt can optimize differentiable separable functions. For more details, + * see the documentation on function types included with this distribution or on + * the ensmallen website. + */ +class AdaSqrt +{ + public: + /** + * Construct the AdaSqrt optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in one step. + * @param epsilon Value used to initialise the squared gradient parameter. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + AdaSqrt(const double stepSize = 0.01, + const size_t batchSize = 32, + const double epsilon = 1e-8, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false); + + /** + * Optimize the given function using AdaSqrt. 
The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to be optimized. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename std::enable_if::value, + typename MatType::elem_type>::type + Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with AdaSqrt policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. 
+#include "ada_sqrt_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp new file mode 100644 index 0000000..1bc36d9 --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_impl.hpp @@ -0,0 +1,38 @@ +/** + * @file ada_sqrt_impl.hpp + * @author Marcus Edel + * + * Implementation of AdaSqrt optimizer. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_IMPL_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_IMPL_HPP + +namespace ens { + +inline AdaSqrt::AdaSqrt(const double stepSize, + const size_t batchSize, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + AdaSqrtUpdate(epsilon), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp new file mode 100644 index 0000000..feae24c --- /dev/null +++ b/inst/include/ensmallen_bits/ada_sqrt/ada_sqrt_update.hpp @@ -0,0 +1,118 @@ +/** + * @file ada_sqrt_update.hpp + * @author Marcus Edel + * + * AdaSqrt update for Stochastic Gradient Descent. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_ADA_SQRT_ADA_SQRT_UPDATE_HPP +#define ENSMALLEN_ADA_SQRT_ADA_SQRT_UPDATE_HPP + +namespace ens { + +/** + * Implementation of the AdaSqrt update policy. AdaSqrt update policy chooses + * learning rate dynamically by adapting to the data and iteration. + * + * For more information, see the following. + * + * @code + * @misc{hu2019secondorder, + * title = {Second-order Information in First-order Optimization Methods}, + * author = {Yuzheng Hu and Licong Lin and Shange Tang}, + * year = {2019}, + * eprint = {1912.09926}, + * } + * @endcode + * + */ +class AdaSqrtUpdate +{ + public: + /** + * Construct the AdaSqrt update policy with given epsilon parameter. + * + * @param epsilon The epsilon value used to initialise the squared gradient + * parameter. + */ + AdaSqrtUpdate(const double epsilon = 1e-8) : epsilon(epsilon) + { + // Nothing to do. + } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return epsilon; } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return epsilon; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD optimizer before the start of the + * iteration update process. 
In AdaSqrt update policy, squared gradient + * matrix is initialized to the zeros matrix with the same size as gradient + * matrix (see ens::SGD<>). + * + * @param parent Instantiated parent class. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(AdaSqrtUpdate& parent, const size_t rows, const size_t cols) : + parent(parent), + squaredGradient(rows, cols), + iteration(0) + { + // Initialize an empty matrix for sum of squares of parameter gradient. + squaredGradient.zeros(); + } + + /** + * Update step for SGD. The AdaSqrt update adapts the learning rate by + * performing larger updates for more sparse parameters and smaller updates + * for less sparse parameters. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + ++iteration; + + squaredGradient += arma::square(gradient); + + iterate -= stepSize * std::sqrt(iteration) * gradient / + (squaredGradient + parent.epsilon); + } + + private: + // Instantiated parent class. + AdaSqrtUpdate& parent; + // The squared gradient matrix. + GradType squaredGradient; + // The number of iterations. + size_t iteration; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/adam/adam_update.hpp b/inst/include/ensmallen_bits/adam/adam_update.hpp index 8831c7b..de7f61e 100644 --- a/inst/include/ensmallen_bits/adam/adam_update.hpp +++ b/inst/include/ensmallen_bits/adam/adam_update.hpp @@ -52,8 +52,7 @@ class AdamUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -73,11 +72,6 @@ class AdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -97,7 +91,8 @@ class AdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -115,7 +110,7 @@ class AdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -124,10 +119,8 @@ class AdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); /** * It should be noted that the term, m / (arma::sqrt(v) + eps), in the @@ -147,6 +140,9 @@ class AdamUpdate // The exponential moving average of squared gradient values. GradType v; + + // The number of iterations. + size_t iteration; }; private: @@ -158,9 +154,6 @@ class AdamUpdate // The second moment coefficient. 
double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/adamax_update.hpp b/inst/include/ensmallen_bits/adam/adamax_update.hpp index 38ef910..a6c9f2f 100644 --- a/inst/include/ensmallen_bits/adam/adamax_update.hpp +++ b/inst/include/ensmallen_bits/adam/adamax_update.hpp @@ -54,8 +54,7 @@ class AdaMaxUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -75,11 +74,6 @@ class AdaMaxUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -99,7 +93,8 @@ class AdaMaxUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AdaMaxUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); u.zeros(rows, cols); @@ -117,7 +112,7 @@ class AdaMaxUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -127,8 +122,7 @@ class AdaMaxUpdate u *= parent.beta2; u = arma::max(u, arma::abs(gradient)); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); if (biasCorrection1 != 0) iterate -= (stepSize / biasCorrection1 * m / (u + parent.epsilon)); @@ -141,6 +135,8 @@ class AdaMaxUpdate GradType m; // The exponentially weighted infinity norm. GradType u; + // The number of iterations. + size_t iteration; }; private: @@ -152,9 +148,6 @@ class AdaMaxUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/amsgrad_update.hpp b/inst/include/ensmallen_bits/adam/amsgrad_update.hpp index d49314f..f1f420e 100644 --- a/inst/include/ensmallen_bits/adam/amsgrad_update.hpp +++ b/inst/include/ensmallen_bits/adam/amsgrad_update.hpp @@ -47,8 +47,7 @@ class AMSGradUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -68,11 +67,6 @@ class AMSGradUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -92,7 +86,8 @@ class AMSGradUpdate * @param cols Number of columns in the gradient matrix. */ Policy(AMSGradUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -111,7 +106,7 @@ class AMSGradUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. 
m *= parent.beta1; @@ -120,10 +115,8 @@ class AMSGradUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); // Element wise maximum of past and present squared gradients. vImproved = arma::max(vImproved, v); @@ -144,6 +137,9 @@ class AMSGradUpdate // The optimal squared gradient value. GradType vImproved; + + // The number of iterations. + size_t iteration; }; private: @@ -155,9 +151,6 @@ class AMSGradUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/nadam_update.hpp b/inst/include/ensmallen_bits/adam/nadam_update.hpp index 880de5a..24f105c 100644 --- a/inst/include/ensmallen_bits/adam/nadam_update.hpp +++ b/inst/include/ensmallen_bits/adam/nadam_update.hpp @@ -50,8 +50,7 @@ class NadamUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - scheduleDecay(scheduleDecay), - iteration(0) + scheduleDecay(scheduleDecay) { // Nothing to do. } @@ -76,11 +75,6 @@ class NadamUpdate //! Modify the decay parameter for decay coefficients double& ScheduleDecay() { return scheduleDecay; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -101,7 +95,8 @@ class NadamUpdate */ Policy(NadamUpdate& parent, const size_t rows, const size_t cols) : parent(parent), - cumBeta1(1) + cumBeta1(1), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -119,7 +114,7 @@ class NadamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -129,18 +124,15 @@ class NadamUpdate v += (1 - parent.beta2) * gradient % gradient; double beta1T = parent.beta1 * (1 - (0.5 * - std::pow(0.96, parent.iteration * parent.scheduleDecay))); + std::pow(0.96, iteration * parent.scheduleDecay))); double beta1T1 = parent.beta1 * (1 - (0.5 * - std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay))); + std::pow(0.96, (iteration + 1) * parent.scheduleDecay))); cumBeta1 *= beta1T; const double biasCorrection1 = 1.0 - cumBeta1; - - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); - + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); const double biasCorrection3 = 1.0 - (cumBeta1 * beta1T1); /* Note :- arma::sqrt(v) + epsilon * sqrt(biasCorrection2) is approximated @@ -163,6 +155,9 @@ class NadamUpdate // The cumulative product of decay coefficients. double cumBeta1; + + // The number of iterations. + size_t iteration; }; private: @@ -177,9 +172,6 @@ class NadamUpdate // The decay parameter for decay coefficients. double scheduleDecay; - - // The number of iterations. 
- size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/nadamax_update.hpp b/inst/include/ensmallen_bits/adam/nadamax_update.hpp index c5e72f8..f0d9b0c 100644 --- a/inst/include/ensmallen_bits/adam/nadamax_update.hpp +++ b/inst/include/ensmallen_bits/adam/nadamax_update.hpp @@ -50,8 +50,7 @@ class NadaMaxUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - scheduleDecay(scheduleDecay), - iteration(0) + scheduleDecay(scheduleDecay) { // Nothing to do. } @@ -76,11 +75,6 @@ class NadaMaxUpdate //! Modify the decay parameter for decay coefficients double& ScheduleDecay() { return scheduleDecay; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -101,7 +95,8 @@ class NadaMaxUpdate */ Policy(NadaMaxUpdate& parent, const size_t rows, const size_t cols) : parent(parent), - cumBeta1(1) + cumBeta1(1), + iteration(0) { m.zeros(rows, cols); u.zeros(rows, cols); @@ -119,7 +114,7 @@ class NadaMaxUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -128,10 +123,10 @@ class NadaMaxUpdate u = arma::max(u * parent.beta2, arma::abs(gradient)); double beta1T = parent.beta1 * (1 - (0.5 * - std::pow(0.96, parent.iteration * parent.scheduleDecay))); + std::pow(0.96, iteration * parent.scheduleDecay))); double beta1T1 = parent.beta1 * (1 - (0.5 * - std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay))); + std::pow(0.96, (iteration + 1) * parent.scheduleDecay))); cumBeta1 *= beta1T; @@ -158,6 +153,9 @@ class NadaMaxUpdate // The cumulative product of decay coefficients. double cumBeta1; + + // The number of iterations. + size_t iteration; }; private: @@ -172,9 +170,6 @@ class NadaMaxUpdate // The decay parameter for decay coefficients. double scheduleDecay; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp b/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp index 7fa9fbb..426a5bb 100644 --- a/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp +++ b/inst/include/ensmallen_bits/adam/optimisticadam_update.hpp @@ -51,8 +51,7 @@ class OptimisticAdamUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { // Nothing to do. } @@ -72,11 +71,6 @@ class OptimisticAdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -96,7 +90,8 @@ class OptimisticAdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(OptimisticAdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -115,7 +110,7 @@ class OptimisticAdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. 
m *= parent.beta1; @@ -124,13 +119,10 @@ class OptimisticAdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * arma::square(gradient); - GradType mCorrected = m / (1.0 - std::pow(parent.beta1, - parent.iteration)); - GradType vCorrected = v / (1.0 - std::pow(parent.beta2, - parent.iteration)); + GradType mCorrected = m / (1.0 - std::pow(parent.beta1, iteration)); + GradType vCorrected = v / (1.0 - std::pow(parent.beta2, iteration)); - GradType update = mCorrected / - (arma::sqrt(vCorrected) + parent.epsilon); + GradType update = mCorrected / (arma::sqrt(vCorrected) + parent.epsilon); iterate -= (2 * stepSize * update - stepSize * g); @@ -149,6 +141,9 @@ class OptimisticAdamUpdate // The previous update. GradType g; + + // The number of iterations. + size_t iteration; }; private: @@ -160,9 +155,6 @@ class OptimisticAdamUpdate // The second moment coefficient. double beta2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp b/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp new file mode 100644 index 0000000..e524531 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_adam/demon_adam.hpp @@ -0,0 +1,207 @@ +/** + * @file demon_adam.hpp + * @author Marcus Edel + * + * Definition of DemonAdam. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_ADAM_DEMON_ADAM_HPP +#define ENSMALLEN_DEMON_ADAM_DEMON_ADAM_HPP + +#include "../sgd/sgd.hpp" +#include "../adam/adam_update.hpp" +#include "../adam/adamax_update.hpp" +#include "../adam/amsgrad_update.hpp" +#include "../adam/nadam_update.hpp" +#include "../adam/nadamax_update.hpp" +#include "../adam/optimisticadam_update.hpp" +#include "demon_adam_update.hpp" + +namespace ens { + +/** + * DemonAdam automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * + * DemonAdam can optimize differentiable separable functions. For more details, + * see the documentation on function types include with this distribution or on + * the ensmallen website. + * + * @tparam UpdateRule Adam optimizer update rule to be used. + */ +template +class DemonAdamType +{ + public: + /** + * Construct the DemonAdam optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param momentum The initial momentum coefficient. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param beta1 Exponential decay rate for the first moment estimates. 
+ * @param beta2 Exponential decay rate for the weighted infinity norm + * estimates. + * @param eps Value used to initialise the mean squared gradient parameter. + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + DemonAdamType(const double stepSize = 0.001, + const size_t batchSize = 32, + const double momentum = 0.9, + const double beta1 = 0.9, + const double beta2 = 0.999, + const double eps = 1e-8, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + DemonAdamUpdate(maxIterations * batchSize, + momentum, UpdateRule(eps, beta1, beta2)), + NoDecay(), + resetPolicy, + exactObjective) + { /* Nothing to do here. */ } + + /** + * Optimize the given function using DemonAdam. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.template Optimize< + SeparableFunctionType, MatType, GradType, CallbackTypes...>( + function, iterate, std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the moment coefficient. + double Momentum() const { return optimizer.UpdatePolicy().Momentum(); } + //! Modify the moment coefficient. + double& Momentum() { return optimizer.UpdatePolicy().Momentum(); } + + //! Get the momentum iteration number. + size_t MomentumIterations() const + { return optimizer.UpdatePolicy().MomentumIterations(); } + //! Modify the momentum iteration number. + size_t& MomentumIterations() + { return optimizer.UpdatePolicy().MomentumIterations(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! 
Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with DemonAdam policy. + SGD> optimizer; +}; + +using DemonAdam = DemonAdamType; + +using DemonAdaMax = DemonAdamType; + +using DemonAMSGrad = DemonAdamType; + +using DemonNadam = DemonAdamType; + +using DemonNadaMax = DemonAdamType; + +using DemonOptimisticAdam = DemonAdamType; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp b/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp new file mode 100644 index 0000000..47f6b36 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_adam/demon_adam_update.hpp @@ -0,0 +1,169 @@ +/** + * @file demon_sgd_update.hpp + * @author Marcus Edel + * + * Implementation of DemonAdam. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_ADAM_DEMON_ADAM_UPDATE_HPP +#define ENSMALLEN_DEMON_ADAM_DEMON_ADAM_UPDATE_HPP + +#include + +namespace ens { + +/** + * DemonAdam automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * @endcode + * + * @tparam UpdateRule DemonAdam optimizer update rule to be used. + */ +template +class DemonAdamUpdate +{ + public: + /** + * Construct the DemonAdam update policy with the given parameters. + * + * @param momentumIterations The number of iterations before the momentum + * will decay to zero. + * @param momentum The initial momentum coefficient. + * @param adamUpdate Instantiated Adam update policy used to adjust the given + * parameters. + */ + DemonAdamUpdate(const size_t momentumIterations = 100, + const double momentum = 0.9, + const UpdateRule& adamUpdate = UpdateRule()) : + T(momentumIterations), + betaInit(momentum), + t(0), + adamUpdateInst(adamUpdate) + { + // Make sure the momentum iterations parameter is non-zero. 
+    assert(momentumIterations != 0 && "The number of iterations before the "
+        "momentum will decay is zero, make sure the max iterations and "
+        "batch size parameter is set correctly. "
+        "Default: momentumIterations = maxIterations / batchSize.");
+  }
+
+  //! Get the momentum coefficient.
+  double Momentum() const { return betaInit; }
+  //! Modify the momentum coefficient.
+  double& Momentum() { return betaInit; }
+
+  //! Get the current iteration number.
+  size_t Iteration() const { return t; }
+  //! Modify the current iteration number.
+  size_t& Iteration() { return t; }
+
+  //! Get the momentum iteration number.
+  size_t MomentumIterations() const { return T; }
+  //! Modify the momentum iteration number.
+  size_t& MomentumIterations() { return T; }
+
+  /**
+   * The UpdatePolicyType policy classes must contain an internal 'Policy'
+   * template class with two template arguments: MatType and GradType. This is
+   * instantiated at the start of the optimization, and holds parameters
+   * specific to an individual optimization.
+   */
+  template<typename MatType, typename GradType>
+  class Policy
+  {
+   public:
+    // Convenient typedef.
+    typedef typename UpdateRule::template Policy<MatType, GradType>
+        InstUpdateRuleType;
+
+    /**
+     * This constructor is called by the SGD Optimize() method before the start
+     * of the iteration update process.
+     *
+     * @param parent Instantiated DemonAdamUpdate parent object.
+     * @param rows Number of rows in the gradient matrix.
+     * @param cols Number of columns in the gradient matrix.
+     */
+    Policy(DemonAdamUpdate& parent,
+           const size_t rows,
+           const size_t cols) :
+        parent(parent),
+        adamUpdate(new InstUpdateRuleType(parent.adamUpdateInst, rows, cols))
+    { /* Nothing to do here */ }
+
+    /**
+     * Clean any memory associated with the Policy object.
+     */
+    ~Policy()
+    {
+      delete adamUpdate;
+    }
+
+    /**
+     * Update step for DemonAdam.
+     *
+     * @param iterate Parameters that minimize the function.
+     * @param stepSize Step size to be used for the given iteration.
+     * @param gradient The gradient matrix.
+     */
+    void Update(MatType& iterate,
+                const double stepSize,
+                const GradType& gradient)
+    {
+      double decayRate = 1;
+      if (parent.t > 0)
+        decayRate = 1.0 - (double) parent.t / (double) parent.T;
+
+      const double betaDecay = parent.betaInit * decayRate;
+      const double beta = betaDecay / ((1.0 - parent.betaInit) + betaDecay);
+
+      // Perform the update.
+      iterate *= beta;
+
+      // Apply the adam update.
+      adamUpdate->Update(iterate, stepSize, gradient);
+
+      // Increment the iteration counter variable.
+      ++parent.t;
+    }
+
+   private:
+    //! Instantiated parent object.
+    DemonAdamUpdate& parent;
+
+    //! The update policy.
+    InstUpdateRuleType* adamUpdate;
+  };
+
+ private:
+  //! The number of momentum iterations.
+  size_t T;
+
+  //! Initial momentum coefficient.
+  double betaInit;
+
+  //! The number of iterations.
+  size_t t;
+
+  //! The adam update policy.
+  UpdateRule adamUpdateInst;
+};
+
+} // namespace ens
+
+#endif
diff --git a/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp b/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp
new file mode 100644
index 0000000..ddbf1d2
--- /dev/null
+++ b/inst/include/ensmallen_bits/demon_sgd/demon_sgd.hpp
@@ -0,0 +1,178 @@
+/**
+ * @file demon_sgd.hpp
+ * @author Marcus Edel
+ *
+ * Definition of DemonSGD.
+ *
+ * ensmallen is free software; you may redistribute it and/or modify it under
+ * the terms of the 3-clause BSD license. You should have received a copy of
+ * the 3-clause BSD license along with ensmallen. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */ +#ifndef ENSMALLEN_DEMON_SGD_DEMON_SGD_HPP +#define ENSMALLEN_DEMON_SGD_DEMON_SGD_HPP + +#include "../sgd/sgd.hpp" +#include "demon_sgd_update.hpp" + +namespace ens { + +/** + * DemonSGD automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. + * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * + * DemonSGD can optimize differentiable separable functions. For more details, + * see the documentation on function types include with this distribution or on + * the ensmallen website. + */ +class DemonSGD +{ + public: + /** + * Construct the DemonSGD optimizer with the given function and parameters. + * The defaults here are not necessarily good for the given problem, so it is + * suggested that the values used be tailored to the task at hand. The + * maximum number of iterations refers to the maximum number of points that + * are processed (i.e., one iteration equals one point; one iteration does not + * equal one pass over the dataset). + * + * @param stepSize Step size for each iteration. + * @param batchSize Number of points to process in a single step. + * @param momentum The initial momentum coefficient. + * @param maxIterations Maximum number of iterations allowed (0 means no + * limit). + * @param tolerance Maximum absolute tolerance to terminate algorithm. + * @param shuffle If true, the function order is shuffled; otherwise, each + * function is visited in linear order. + * @param resetPolicy If true, parameters are reset before every Optimize + * call; otherwise, their values are retained. + * @param exactObjective Calculate the exact objective (Default: estimate the + * final objective obtained on the last pass over the data). + */ + DemonSGD(const double stepSize = 0.001, + const size_t batchSize = 32, + const double momentum = 0.9, + const size_t maxIterations = 100000, + const double tolerance = 1e-5, + const bool shuffle = true, + const bool resetPolicy = true, + const bool exactObjective = false) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + DemonSGDUpdate(maxIterations * batchSize, momentum), + NoDecay(), + resetPolicy, + exactObjective) + { /* Nothing to do here. */ } + + /** + * Optimize the given function using DemonSGD. The given starting point will + * be modified to store the finishing point of the algorithm, and the final + * objective value is returned. + * + * @tparam SeparableFunctionType Type of the function to optimize. + * @tparam MatType Type of matrix to optimize with. + * @tparam GradType Type of matrix to use to represent function gradients. + * @tparam CallbackTypes Types of callback functions. + * @param function Function to optimize. + * @param iterate Starting point (will be modified). + * @param callbacks Callback functions. + * @return Objective value of the final point. + */ + template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return optimizer.template Optimize< + SeparableFunctionType, MatType, GradType, CallbackTypes...>( + function, iterate, std::forward(callbacks)...); + } + + //! Forward the MatType as GradType. 
+ template + typename MatType::elem_type Optimize(SeparableFunctionType& function, + MatType& iterate, + CallbackTypes&&... callbacks) + { + return Optimize(function, iterate, + std::forward(callbacks)...); + } + + //! Get the step size. + double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the moment coefficient. + double Momentum() const { return optimizer.UpdatePolicy().Momentum(); } + //! Modify the moment coefficient. + double& Momentum() { return optimizer.UpdatePolicy().Momentum(); } + + //! Get the momentum iteration number. + size_t MomentumIterations() const + { return optimizer.UpdatePolicy().MomentumIterations(); } + //! Modify the momentum iteration number. + size_t& MomentumIterations() + { return optimizer.UpdatePolicy().MomentumIterations(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters + //! are reset before Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with DemonSGD policy. + SGD optimizer; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp b/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp new file mode 100644 index 0000000..dc8b7c5 --- /dev/null +++ b/inst/include/ensmallen_bits/demon_sgd/demon_sgd_update.hpp @@ -0,0 +1,139 @@ +/** + * @file demon_sgd_update.hpp + * @author Marcus Edel + * + * Implementation of DemonSGD. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_DEMON_SGD_DEMON_SGD_UPDATE_HPP +#define ENSMALLEN_DEMON_SGD_DEMON_SGD_UPDATE_HPP + +namespace ens { + +/** + * DemonSGD automatically decays momentum, motivated by decaying the total + * contribution of a gradient to all future updates. + * + * For more information, see the following. 
+ * + * @code + * @misc{ + * title = {Decaying momentum helps neural network training}, + * author = {John Chen and Cameron Wolfe and Zhao Li + * and Anastasios Kyrillidis}, + * url = {https://arxiv.org/abs/1910.04952} + * year = {2019} + * } + * @endcode + */ +class DemonSGDUpdate +{ + public: + /** + * Construct the DemonSGD update policy with the given parameters. + * + * @param momentumIterations The number of iterations before the momentum + * will decay to zero. + * @param momentum The initial momentum coefficient. + */ + DemonSGDUpdate(const size_t momentumIterations = 100, + const double momentum = 0.9) : + T(momentumIterations), + betaInit(momentum), + t(0) + { + // Make sure the momentum iterations parameter is non-zero. + assert(momentumIterations != 0 && "The number of iterations before the " + "momentum will decay is zero, make sure the max iterations and " + "batch size parameter is set correctly. " + "Default: momentumIterations = maxIterations * batchSize."); + } + + //! Get the momentum coefficient. + double Momentum() const { return betaInit; } + //! Modify the momentum coefficient. + double& Momentum() { return betaInit; } + + //! Get the current iteration number. + size_t Iteration() const { return t; } + //! Modify the current iteration number. + size_t& Iteration() { return t; } + + //! Get the momentum iteration number. + size_t MomentumIterations() const { return T; } + //! Modify the momentum iteration number. + size_t& MomentumIterations() { return T; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. + * + * @param parent Instantiated PadamUpdate parent object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(DemonSGDUpdate& parent, + const size_t /* rows */, + const size_t /* cols */) : + parent(parent) + { /* Nothing to do here */ } + + /** + * Update step for DemonSGD. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + double decayRate = 1; + if (parent.t > 0) + decayRate = 1.0 - (double) parent.t / (double) parent.T; + + const double betaDecay = parent.betaInit * decayRate; + const double beta = betaDecay / ((1.0 - parent.betaInit) + betaDecay); + + // Perform the update. + iterate *= beta; + iterate -= stepSize * gradient; + + // Increment the iteration counter variable. + ++parent.t; + } + + private: + //! Instantiated parent object. + DemonSGDUpdate& parent; + }; + + private: + //! The number of momentum iterations. + size_t T; + + //! Initial momentum coefficient. + double betaInit; + + //! The number of iterations. 
+ size_t t; +}; + +} // namespace ens + +#endif diff --git a/inst/include/ensmallen_bits/ens_version.hpp b/inst/include/ensmallen_bits/ens_version.hpp index 530d64c..185ed3f 100644 --- a/inst/include/ensmallen_bits/ens_version.hpp +++ b/inst/include/ensmallen_bits/ens_version.hpp @@ -15,17 +15,17 @@ #define ENS_VERSION_MAJOR 2 // The minor version is two digits so regular numerical comparisons of versions // work right. The first minor version of a release is always 10. -#define ENS_VERSION_MINOR 18 -#define ENS_VERSION_PATCH 2 +#define ENS_VERSION_MINOR 19 +#define ENS_VERSION_PATCH 0 // If this is a release candidate, it will be reflected in the version name // (i.e. the version name will be "RC1", "RC2", etc.). Otherwise the version // name will typically be a seemingly arbitrary set of words that does not // contain the capitalized string "RC". -#define ENS_VERSION_NAME "Fairmount Bagel" +#define ENS_VERSION_NAME "Eight Ball Deluxe" // Incorporate the date the version was released. #define ENS_VERSION_YEAR "2022" -#define ENS_VERSION_MONTH "02" -#define ENS_VERSION_DAY "13" +#define ENS_VERSION_MONTH "04" +#define ENS_VERSION_DAY "06" namespace ens { diff --git a/inst/include/ensmallen_bits/ftml/ftml_update.hpp b/inst/include/ensmallen_bits/ftml/ftml_update.hpp index 7a5ca1c..5db2b05 100644 --- a/inst/include/ensmallen_bits/ftml/ftml_update.hpp +++ b/inst/include/ensmallen_bits/ftml/ftml_update.hpp @@ -50,8 +50,7 @@ class FTMLUpdate const double beta2 = 0.999) : epsilon(epsilon), beta1(beta1), - beta2(beta2), - iteration(0) + beta2(beta2) { /* Do nothing. */ } //! Get the value used to initialise the squared gradient parameter. @@ -69,11 +68,6 @@ class FTMLUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -112,16 +106,14 @@ class FTMLUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); MatType sigma = -parent.beta1 * d; d = biasCorrection1 / stepSize * @@ -145,6 +137,9 @@ class FTMLUpdate // Parameter update term. MatType d; + + // The number of iterations. + size_t iteration; }; private: @@ -156,9 +151,6 @@ class FTMLUpdate // The second moment coefficient. double beta2; - - // The number of iterations. 
- size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp b/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp index 289f006..f607c71 100644 --- a/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp +++ b/inst/include/ensmallen_bits/fw/proximal/proximal_impl.hpp @@ -53,7 +53,7 @@ inline void Proximal::ProjectToL1Ball(MatType& v, double tau) if (nu > 0) break; } - double theta = (simplexSum(rho) - tau) / rho; + const double theta = (simplexSum(rho) - tau) / rho; // Threshold on absolute value of v with theta. for (arma::uword j = 0; j < simplexSol.n_rows; j++) diff --git a/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp b/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp index fbcad95..28c1552 100644 --- a/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp +++ b/inst/include/ensmallen_bits/lbfgs/lbfgs_impl.hpp @@ -79,7 +79,7 @@ double L_BFGS::ChooseScalingFactor(const size_t iterationNum, { typedef typename CubeType::elem_type CubeElemType; - double scalingFactor = 1.0; + double scalingFactor; if (iterationNum > 0) { int previousPos = (iterationNum - 1) % numBasis; @@ -378,7 +378,7 @@ L_BFGS::Optimize(FunctionType& function, terminate |= Callback::EvaluateWithGradient(*this, f, iterate, functionValue, gradient, callbacks...); - ElemType prevFunctionValue = functionValue; + ElemType prevFunctionValue; // The main optimization loop. terminate |= Callback::BeginOptimization(*this, f, iterate, callbacks...); diff --git a/inst/include/ensmallen_bits/padam/padam_update.hpp b/inst/include/ensmallen_bits/padam/padam_update.hpp index 570b977..a4a6924 100644 --- a/inst/include/ensmallen_bits/padam/padam_update.hpp +++ b/inst/include/ensmallen_bits/padam/padam_update.hpp @@ -50,8 +50,7 @@ class PadamUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - partial(partial), - iteration(0) + partial(partial) { // Nothing to do. } @@ -76,11 +75,6 @@ class PadamUpdate //! Modify the partial adaptive parameter. double& Partial() { return partial; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - /** * The UpdatePolicyType policy classes must contain an internal 'Policy' * template class with two template arguments: MatType and GradType. This is @@ -100,7 +94,8 @@ class PadamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(PadamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -119,7 +114,7 @@ class PadamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -128,10 +123,8 @@ class PadamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); // Element wise maximum of past and present squared gradients. vImproved = arma::max(vImproved, v); @@ -152,6 +145,9 @@ class PadamUpdate //! The optimal sqaured gradient value. GradType vImproved; + + //! The number of iterations. + size_t iteration; }; private: @@ -166,9 +162,6 @@ class PadamUpdate //! 
Partial adaptive parameter. double partial; - - //! The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp b/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp index 8540033..f408377 100644 --- a/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp +++ b/inst/include/ensmallen_bits/qhadam/qhadam_update.hpp @@ -54,8 +54,7 @@ class QHAdamUpdate beta1(beta1), beta2(beta2), v1(v1), - v2(v2), - iteration(0) + v2(v2) { // Nothing to do. } @@ -75,11 +74,6 @@ class QHAdamUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - //! Get the first quasi-hyperbolic term. double V1() const { return v1; } //! Modify the first quasi-hyperbolic term. @@ -109,7 +103,8 @@ class QHAdamUpdate * @param cols Number of columns in the gradient matrix. */ Policy(QHAdamUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -127,7 +122,7 @@ class QHAdamUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; // And update the iterate. m *= parent.beta1; @@ -136,10 +131,8 @@ class QHAdamUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); GradType mDash = m / biasCorrection1; GradType vDash = v / biasCorrection2; @@ -160,6 +153,9 @@ class QHAdamUpdate // The exponential moving average of squared gradient values. GradType v; + + // The number of iterations. + size_t iteration; }; private: @@ -177,9 +173,6 @@ class QHAdamUpdate // The second quasi-hyperbolic term. double v2; - - // The number of iterations. - size_t iteration; }; } // namespace ens diff --git a/inst/include/ensmallen_bits/sa/sa_impl.hpp b/inst/include/ensmallen_bits/sa/sa_impl.hpp index 978b59e..f680f99 100644 --- a/inst/include/ensmallen_bits/sa/sa_impl.hpp +++ b/inst/include/ensmallen_bits/sa/sa_impl.hpp @@ -70,7 +70,7 @@ typename MatType::elem_type SA::Optimize( ElemType energy = function.Evaluate(iterate); Callback::Evaluate(*this, function, iterate, energy, callbacks...); - ElemType oldEnergy = energy; + ElemType oldEnergy; size_t idx = 0; size_t sweepCounter = 0; diff --git a/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp b/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp index 7fa8175..4043401 100644 --- a/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp +++ b/inst/include/ensmallen_bits/spalera_sgd/spalera_sgd_impl.hpp @@ -178,7 +178,6 @@ SPALeRASGD::Optimize( i += effectiveBatchSize; currentFunction += effectiveBatchSize; overallObjective += currentObjective; - currentObjective /= effectiveBatchSize; // Is this iteration the start of a sequence? 
if ((currentFunction % numFunctions) == 0) diff --git a/inst/include/ensmallen_bits/swats/swats_update.hpp b/inst/include/ensmallen_bits/swats/swats_update.hpp index 18f1524..0a80a77 100644 --- a/inst/include/ensmallen_bits/swats/swats_update.hpp +++ b/inst/include/ensmallen_bits/swats/swats_update.hpp @@ -50,7 +50,6 @@ class SWATSUpdate epsilon(epsilon), beta1(beta1), beta2(beta2), - iteration(0), phaseSGD(false), sgdRate(0), sgdLambda(0) @@ -73,11 +72,6 @@ class SWATSUpdate //! Modify the second moment coefficient. double& Beta2() { return beta2; } - //! Get the current iteration number. - size_t Iteration() const { return iteration; } - //! Modify the current iteration number. - size_t& Iteration() { return iteration; } - //! Get whether the current phase is SGD. bool PhaseSGD() const { return phaseSGD; } //! Modify whether the current phase is SGD. @@ -111,7 +105,8 @@ class SWATSUpdate * @param cols Number of columns in the gradient matrix. */ Policy(SWATSUpdate& parent, const size_t rows, const size_t cols) : - parent(parent) + parent(parent), + iteration(0) { m.zeros(rows, cols); v.zeros(rows, cols); @@ -131,7 +126,7 @@ class SWATSUpdate const GradType& gradient) { // Increment the iteration counter variable. - ++parent.iteration; + ++iteration; if (parent.phaseSGD) { @@ -150,10 +145,8 @@ class SWATSUpdate v *= parent.beta2; v += (1 - parent.beta2) * (gradient % gradient); - const double biasCorrection1 = 1.0 - std::pow(parent.beta1, - parent.iteration); - const double biasCorrection2 = 1.0 - std::pow(parent.beta2, - parent.iteration); + const double biasCorrection1 = 1.0 - std::pow(parent.beta1, iteration); + const double biasCorrection2 = 1.0 - std::pow(parent.beta2, iteration); GradType delta = stepSize * m / biasCorrection1 / (arma::sqrt(v / biasCorrection2) + parent.epsilon); @@ -167,8 +160,7 @@ class SWATSUpdate (1 - parent.beta2) * rate; parent.sgdRate = parent.sgdLambda / biasCorrection2; - if (std::abs(parent.sgdRate - rate) < parent.epsilon && - parent.iteration > 1) + if (std::abs(parent.sgdRate - rate) < parent.epsilon && iteration > 1) { parent.phaseSGD = true; v.zeros(); @@ -188,6 +180,9 @@ class SWATSUpdate //! The exponential moving average of squared gradient values (SGD). GradType sgdV; + + //! The number of iterations. + size_t iteration; }; private: @@ -200,9 +195,6 @@ class SWATSUpdate //! The second moment coefficient. double beta2; - //! The number of iterations. - size_t iteration; - //! Wether to use the SGD or Adam update rule. bool phaseSGD; diff --git a/inst/include/ensmallen_bits/yogi/yogi.hpp b/inst/include/ensmallen_bits/yogi/yogi.hpp new file mode 100644 index 0000000..4529d24 --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi.hpp @@ -0,0 +1,189 @@ +/** + * @file yogi.hpp + * @author Marcus Edel + * + * Class wrapper for the Yogi update Policy. Yogi is based on Adam with more + * fine grained effective learning rate control. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_HPP +#define ENSMALLEN_YOGI_YOGI_HPP + +#include +#include "yogi_update.hpp" + +namespace ens { + +/** + * Yogi is an variation of Adam with more fine grained effective learning rate + * control. + * + * For more information, see the following. 
+ *
+ * @code
+ * @inproceedings{Zaheer2018,
+ *   author = {Zaheer, Manzil and Reddi, Sashank J. and Sachan, Devendra
+ *             and Kale, Satyen and Kumar, Sanjiv},
+ *   title = {Adaptive Methods for Nonconvex Optimization},
+ *   year = {2018},
+ *   publisher = {Curran Associates Inc.},
+ *   booktitle = {Proceedings of the 32nd International Conference on Neural
+ *                Information Processing Systems},
+ *   pages = {9815–9825},
+ *   series = {NIPS'18}
+ * }
+ * @endcode
+ *
+ * Yogi can optimize differentiable separable functions. For more details,
+ * see the documentation on function types included with this distribution or
+ * on the ensmallen website.
+ */
+class Yogi
+{
+ public:
+  /**
+   * Construct the Yogi optimizer with the given function and parameters.
+   * Yogi is sensitive to its parameters, and hence good hyperparameter
+   * selection is necessary, as the defaults may not fit every case.
+   *
+   * The maximum number of iterations refers to the maximum number of
+   * points that are processed (i.e., one iteration equals one point; one
+   * iteration does not equal one pass over the dataset).
+   *
+   * @param stepSize Step size for each iteration.
+   * @param batchSize Number of points to process in a single step.
+   * @param beta1 Exponential decay rate for the first moment estimates.
+   * @param beta2 Exponential decay rate for the weighted infinity norm
+   *     estimates.
+   * @param epsilon Value used to initialise the mean squared gradient
+   *     parameter.
+   * @param maxIterations Maximum number of iterations allowed (0 means no
+   *     limit).
+   * @param tolerance Maximum absolute tolerance to terminate algorithm.
+   * @param shuffle If true, the function order is shuffled; otherwise, each
+   *     function is visited in linear order.
+   * @param resetPolicy If true, parameters are reset before every Optimize
+   *     call; otherwise, their values are retained.
+   * @param exactObjective Calculate the exact objective (Default: estimate the
+   *     final objective obtained on the last pass over the data).
+   */
+  Yogi(const double stepSize = 0.001,
+       const size_t batchSize = 32,
+       const double beta1 = 0.9,
+       const double beta2 = 0.999,
+       const double epsilon = 1e-8,
+       const size_t maxIterations = 100000,
+       const double tolerance = 1e-5,
+       const bool shuffle = true,
+       const bool resetPolicy = true,
+       const bool exactObjective = false);
+
+  /**
+   * Optimize the given function using Yogi. The given starting point will be
+   * modified to store the finishing point of the algorithm, and the final
+   * objective value is returned.
+   *
+   * @tparam SeparableFunctionType Type of the function to optimize.
+   * @tparam MatType Type of matrix to optimize with.
+   * @tparam GradType Type of matrix to use to represent function gradients.
+   * @tparam CallbackTypes Types of callback functions.
+   * @param function Function to optimize.
+   * @param iterate Starting point (will be modified).
+   * @param callbacks Callback functions.
+   * @return Objective value of the final point.
+   */
+  template<typename SeparableFunctionType,
+           typename MatType,
+           typename GradType,
+           typename... CallbackTypes>
+  typename std::enable_if<IsArmaType<GradType>::value,
+      typename MatType::elem_type>::type
+  Optimize(SeparableFunctionType& function,
+           MatType& iterate,
+           CallbackTypes&&... callbacks)
+  {
+    return optimizer.Optimize<SeparableFunctionType, MatType, GradType,
+        CallbackTypes...>(function, iterate,
+        std::forward<CallbackTypes>(callbacks)...);
+  }
+
+  //! Forward the MatType as GradType.
+  template<typename SeparableFunctionType,
+           typename MatType,
+           typename... CallbackTypes>
+  typename MatType::elem_type Optimize(SeparableFunctionType& function,
+                                       MatType& iterate,
+                                       CallbackTypes&&... callbacks)
+  {
+    return Optimize<SeparableFunctionType, MatType, MatType,
+        CallbackTypes...>(function, iterate,
+        std::forward<CallbackTypes>(callbacks)...);
+  }
+
+  //! Get the step size.
+ double StepSize() const { return optimizer.StepSize(); } + //! Modify the step size. + double& StepSize() { return optimizer.StepSize(); } + + //! Get the batch size. + size_t BatchSize() const { return optimizer.BatchSize(); } + //! Modify the batch size. + size_t& BatchSize() { return optimizer.BatchSize(); } + + //! Get the smoothing parameter. + double Beta1() const { return optimizer.UpdatePolicy().Beta1(); } + //! Modify the smoothing parameter. + double& Beta1() { return optimizer.UpdatePolicy().Beta1(); } + + //! Get the second moment coefficient. + double Beta2() const { return optimizer.UpdatePolicy().Beta2(); } + //! Modify the second moment coefficient. + double& Beta2() { return optimizer.UpdatePolicy().Beta2(); } + + //! Get the value used to initialise the mean squared gradient parameter. + double Epsilon() const { return optimizer.UpdatePolicy().Epsilon(); } + //! Modify the value used to initialise the mean squared gradient parameter. + double& Epsilon() { return optimizer.UpdatePolicy().Epsilon(); } + + //! Get the maximum number of iterations (0 indicates no limit). + size_t MaxIterations() const { return optimizer.MaxIterations(); } + //! Modify the maximum number of iterations (0 indicates no limit). + size_t& MaxIterations() { return optimizer.MaxIterations(); } + + //! Get the tolerance for termination. + double Tolerance() const { return optimizer.Tolerance(); } + //! Modify the tolerance for termination. + double& Tolerance() { return optimizer.Tolerance(); } + + //! Get whether or not the individual functions are shuffled. + bool Shuffle() const { return optimizer.Shuffle(); } + //! Modify whether or not the individual functions are shuffled. + bool& Shuffle() { return optimizer.Shuffle(); } + + //! Get whether or not the actual objective is calculated. + bool ExactObjective() const { return optimizer.ExactObjective(); } + //! Modify whether or not the actual objective is calculated. + bool& ExactObjective() { return optimizer.ExactObjective(); } + + //! Get whether or not the update policy parameters are reset before + //! Optimize call. + bool ResetPolicy() const { return optimizer.ResetPolicy(); } + //! Modify whether or not the update policy parameters + //! are reset before Optimize call. + bool& ResetPolicy() { return optimizer.ResetPolicy(); } + + private: + //! The Stochastic Gradient Descent object with Yogi policy. + SGD optimizer; +}; + +} // namespace ens + +// Include implementation. +#include "yogi_impl.hpp" + +#endif diff --git a/inst/include/ensmallen_bits/yogi/yogi_impl.hpp b/inst/include/ensmallen_bits/yogi/yogi_impl.hpp new file mode 100644 index 0000000..39a777b --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi_impl.hpp @@ -0,0 +1,44 @@ +/** + * @file yogi_impl.hpp + * @author Marcus Edel + * + * Implementation of Yogi class wrapper. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_IMPL_HPP +#define ENSMALLEN_YOGI_YOGI_IMPL_HPP + +// In case it hasn't been included yet. 
+#include "yogi.hpp" + +namespace ens { + +inline Yogi::Yogi( + const double stepSize, + const size_t batchSize, + const double beta1, + const double beta2, + const double epsilon, + const size_t maxIterations, + const double tolerance, + const bool shuffle, + const bool resetPolicy, + const bool exactObjective) : + optimizer(stepSize, + batchSize, + maxIterations, + tolerance, + shuffle, + YogiUpdate(epsilon, beta1, beta2), + NoDecay(), + resetPolicy, + exactObjective) +{ /* Nothing to do. */ } + +} // namespace ens + + #endif diff --git a/inst/include/ensmallen_bits/yogi/yogi_update.hpp b/inst/include/ensmallen_bits/yogi/yogi_update.hpp new file mode 100644 index 0000000..cdba28d --- /dev/null +++ b/inst/include/ensmallen_bits/yogi/yogi_update.hpp @@ -0,0 +1,146 @@ +/** + * @file yogi_update.hpp + * @author Marcus Edel + * + * Implements the Yogi Optimizer. Yogi is a variant of Adam with more fine + * grained effective learning rate control. + * + * ensmallen is free software; you may redistribute it and/or modify it under + * the terms of the 3-clause BSD license. You should have received a copy of + * the 3-clause BSD license along with ensmallen. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ +#ifndef ENSMALLEN_YOGI_YOGI_UPDATE_HPP +#define ENSMALLEN_YOGI_YOGI_UPDATE_HPP + +namespace ens { + +/** + * Yogi builds upon the Adam update strategy but provides more fine grained + * effective learning rate control. + * + * For more information, see the following. + * + * @code + * @inproceedings{Zaheer2018, + * author = {Zaheer, Manzil and Reddi, Sashank J. and Sachan, Devendra + * and Kale, Satyen and Kumar, Sanjiv}, + * title = {Adaptive Methods for Nonconvex Optimization}, + * year = {2018}, + * publisher = {Curran Associates Inc.}, + * booktitle = {Proceedings of the 32nd International Conference on Neural + * Information Processing Systems}, + * pages = {9815–9825}, + * series = {NIPS'18} + * } + * @endcode + */ +class YogiUpdate +{ + public: + /** + * Construct the Yogi update policy with the given parameters. + * + * @param epsilon The epsilon value used to initialise the squared gradient + * parameter. + * @param beta1 The smoothing parameter. + * @param beta2 The second moment coefficient. + * @param v1 The first quasi-hyperbolic term. + * @param v1 The second quasi-hyperbolic term. + */ + YogiUpdate(const double epsilon = 1e-8, + const double beta1 = 0.9, + const double beta2 = 0.999) : + epsilon(epsilon), + beta1(beta1), + beta2(beta2) + { + // Nothing to do. + } + + //! Get the value used to initialise the squared gradient parameter. + double Epsilon() const { return epsilon; } + //! Modify the value used to initialise the squared gradient parameter. + double& Epsilon() { return epsilon; } + + //! Get the smoothing parameter. + double Beta1() const { return beta1; } + //! Modify the smoothing parameter. + double& Beta1() { return beta1; } + + //! Get the second moment coefficient. + double Beta2() const { return beta2; } + //! Modify the second moment coefficient. + double& Beta2() { return beta2; } + + /** + * The UpdatePolicyType policy classes must contain an internal 'Policy' + * template class with two template arguments: MatType and GradType. This is + * instantiated at the start of the optimization, and holds parameters + * specific to an individual optimization. + */ + template + class Policy + { + public: + /** + * This constructor is called by the SGD Optimize() method before the start + * of the iteration update process. 
+ * + * @param parent YogiUpdate object. + * @param rows Number of rows in the gradient matrix. + * @param cols Number of columns in the gradient matrix. + */ + Policy(YogiUpdate& parent, const size_t rows, const size_t cols) : + parent(parent) + { + m.zeros(rows, cols); + v.zeros(rows, cols); + } + + /** + * Update step for Yogi. + * + * @param iterate Parameters that minimize the function. + * @param stepSize Step size to be used for the given iteration. + * @param gradient The gradient matrix. + */ + void Update(MatType& iterate, + const double stepSize, + const GradType& gradient) + { + m *= parent.beta1; + m += (1 - parent.beta1) * gradient; + + const MatType gSquared = arma::square(gradient); + v -= (1 - parent.beta2) * arma::sign(v - gSquared) % gSquared; + + // Now update the iterate. + iterate -= stepSize * m / (arma::sqrt(v) + parent.epsilon); + } + + private: + //! Instantiated parent object. + YogiUpdate& parent; + + //! The exponential moving average of gradient values. + GradType m; + + // The exponential moving average of squared gradient values. + GradType v; + }; + + private: + // The epsilon value used to initialise the squared gradient parameter. + double epsilon; + + // The smoothing parameter. + double beta1; + + // The second moment coefficient. + double beta2; +}; + +} // namespace ens + +#endif diff --git a/tools/HISTORYold.md b/tools/HISTORYold.md index 3d8296b..a5978c4 100644 --- a/tools/HISTORYold.md +++ b/tools/HISTORYold.md @@ -1,3 +1,26 @@ +### ensmallen ?.??.?: "???" +###### ????-??-?? + +### ensmallen 2.19.0: "Eight Ball Deluxe" +###### 2022-04-06 +* Added DemonSGD and DemonAdam optimizers + ([#211](https://github.com/mlpack/ensmallen/pull/211)). + + * Fix bug with Adam-like optimizers not resetting when `resetPolicy` is `true`. + ([#340](https://github.com/mlpack/ensmallen/pull/340)). + + * Add Yogi optimizer + ([#232](https://github.com/mlpack/ensmallen/pull/232)). + + * Add AdaBelief optimizer + ([#233](https://github.com/mlpack/ensmallen/pull/233)). + + * Add AdaSqrt optimizer + ([#234](https://github.com/mlpack/ensmallen/pull/234)). + + * Bump check for minimum supported version of Armadillo + ([#342](https://github.com/mlpack/ensmallen/pull/342)). + ### ensmallen 2.18.2: "Fairmount Bagel" ###### 2022-02-13 * Update Catch2 to 2.13.8
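The DemonAdamUpdate and DemonSGDUpdate policies added in this patch share one idea: the effective momentum is recomputed every step from the initial coefficient betaInit and the fraction of the momentumIterations budget already consumed. The standalone sketch below is not part of the patch; the budget T and the sampling stride are illustrative values chosen only to print the schedule implemented by the Update() methods above.

    #include <cstdio>
    #include <cstddef>

    // Sketch of the Demon momentum schedule:
    //   betaDecay = betaInit * (1 - t / T)
    //   beta_t    = betaDecay / ((1 - betaInit) + betaDecay)
    int main()
    {
      const double betaInit = 0.9;    // initial momentum coefficient
      const std::size_t T = 10000;    // momentumIterations budget

      for (std::size_t t = 0; t <= T; t += 2500)
      {
        const double decayRate = (t > 0) ? 1.0 - (double) t / (double) T : 1.0;
        const double betaDecay = betaInit * decayRate;
        const double beta = betaDecay / ((1.0 - betaInit) + betaDecay);
        std::printf("t = %5zu  beta = %.4f\n", t, beta);
      }
      return 0;
    }

At t = 0 the printed coefficient equals betaInit (0.9 here) and it reaches 0 at t = T, matching the decayRate/betaDecay arithmetic in the patched update policies.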
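Several of the hunks above (FTMLUpdate, PadamUpdate, QHAdamUpdate, SWATSUpdate) move the iteration counter used for bias correction out of the long-lived update-rule object and into its nested Policy, the per-optimization state that SGD rebuilds when resetPolicy is true; this is the change the NEWS entry describes as fixing Adam-like optimizers that did not reset. The sketch below is a schematic of that pattern with made-up class names, not the library's own classes.

    #include <cmath>
    #include <cstddef>

    // Schematic: the counter used for bias correction lives in the per-run
    // Policy object, not in the long-lived update-rule object.
    class ToyAdamLikeUpdate
    {
     public:
      explicit ToyAdamLikeUpdate(const double beta1 = 0.9) : beta1(beta1) { }

      class Policy
      {
       public:
        explicit Policy(ToyAdamLikeUpdate& parent) : parent(parent), iteration(0) { }

        double BiasCorrection1()
        {
          ++iteration;  // restarts from zero whenever a fresh Policy is built
          return 1.0 - std::pow(parent.beta1, iteration);
        }

       private:
        ToyAdamLikeUpdate& parent;
        std::size_t iteration;  // per-optimization state
      };

     private:
      double beta1;
    };

    int main()
    {
      ToyAdamLikeUpdate rule;
      ToyAdamLikeUpdate::Policy firstRun(rule), secondRun(rule);
      // Both runs start their bias correction from iteration 0, unlike a
      // counter stored in `rule` that would keep growing across runs.
      (void) firstRun.BiasCorrection1();
      (void) secondRun.BiasCorrection1();
      return 0;
    }

Because the counter belongs to the Policy, a second Optimize() call with resetPolicy set to true begins its bias correction from zero instead of continuing from the previous run.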
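The new YogiUpdate policy differs from Adam only in the second-moment recursion: instead of an exponential moving average it applies the additive, sign-controlled step v -= (1 - beta2) * sign(v - g^2) % g^2, which limits how quickly the effective learning rate can change. Below is a minimal Armadillo sketch of a single step that mirrors the Update() body above; the matrix sizes and hyperparameter values are illustrative only.

    #include <armadillo>

    int main()
    {
      const double stepSize = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8;

      arma::mat iterate(3, 1, arma::fill::randu);
      arma::mat gradient(3, 1, arma::fill::randn);
      arma::mat m(3, 1, arma::fill::zeros);  // first moment estimate
      arma::mat v(3, 1, arma::fill::zeros);  // second moment estimate

      // One Yogi step, following YogiUpdate::Policy::Update().
      m *= beta1;
      m += (1 - beta1) * gradient;

      const arma::mat gSquared = arma::square(gradient);
      v -= (1 - beta2) * arma::sign(v - gSquared) % gSquared;

      iterate -= stepSize * m / (arma::sqrt(v) + epsilon);

      iterate.print("iterate after one Yogi step:");
      return 0;
    }

When v is smaller than g^2 the sign term adds (1 - beta2) * g^2, exactly as Adam would; when v overshoots, the same magnitude is subtracted, which is the "fine grained effective learning rate control" the class documentation refers to.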
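Since RcppEnsmallen only ships these headers, the optimizers added in 2.19.0 are driven through the same interface as the existing ones. The sketch below is illustrative rather than taken from the package: ToySeparableFunction is a made-up objective written against the separable function signature (Evaluate/Gradient with begin and batchSize, plus NumFunctions and Shuffle) described in the ensmallen function-type documentation, and it is minimized with the Yogi wrapper defined above.

    #include <ensmallen.hpp>
    #include <cmath>

    // Toy separable objective: f(x) = sum_i 0.5 * (x - a_i)^2, minimized at the
    // mean of the targets a_i.
    class ToySeparableFunction
    {
     public:
      explicit ToySeparableFunction(arma::vec targets) : targets(std::move(targets)) { }

      size_t NumFunctions() const { return targets.n_elem; }

      void Shuffle() { targets = arma::shuffle(targets); }

      double Evaluate(const arma::mat& x, const size_t begin, const size_t batchSize)
      {
        double objective = 0.0;
        for (size_t i = begin; i < begin + batchSize; ++i)
          objective += 0.5 * std::pow(x(0, 0) - targets(i), 2.0);
        return objective;
      }

      void Gradient(const arma::mat& x, const size_t begin, arma::mat& gradient,
                    const size_t batchSize)
      {
        gradient.zeros(x.n_rows, x.n_cols);
        for (size_t i = begin; i < begin + batchSize; ++i)
          gradient(0, 0) += x(0, 0) - targets(i);
      }

     private:
      arma::vec targets;
    };

    int main()
    {
      ToySeparableFunction f(arma::vec{1.0, 2.0, 3.0, 4.0});
      arma::mat x(1, 1, arma::fill::zeros);

      ens::Yogi optimizer(0.01 /* stepSize */, 2 /* batchSize */);
      optimizer.Optimize(f, x);

      x.print("optimized x (should move toward 2.5, the mean of the targets):");
      return 0;
    }

The same driver works for DemonSGD, DemonAdam, or any of the other optimizers in this release, since each of them wraps the SGD core with a different update policy.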