diff --git a/CHANGELOG.md b/CHANGELOG.md
index d3e6877280..bccbdd419b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
  - Added SVRG and LSVRG stochastic functions (#1625)
  - Added SAG and SAGA stochastic functions (#1624)
  - Allow `SumFunction` with 1 item (#1857)
+ - Added Barzilai-Borwein step size rule to work with GD, ISTA, FISTA (#1859)
 - Added callback `optimisation.utilities.callbacks.EarlyStoppingObjectiveValue` which stops iterations if an algorithm objective changes less than a provided threshold (#1892)
 - Added callback `optimisation.utilities.callbacks.CGLSEarlyStopping` which replicates the automatic behaviour of CGLS in CIL versions <=24. (#1892)
 - Enhancements:
diff --git a/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py b/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
index ea5ca0ebdb..d7a5c47942 100644
--- a/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
+++ b/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
@@ -18,7 +18,7 @@
 from abc import ABC, abstractmethod
 import numpy
-
+from numbers import Number
 
 class StepSizeRule(ABC):
     """
@@ -70,8 +70,7 @@ def get_step_size(self, algorithm):
 
 class ArmijoStepSizeRule(StepSizeRule):
 
-    """
-    Applies the Armijo rule to calculate the step size (step_size).
+    r""" Applies the Armijo rule to calculate the step size (step_size).
 
     The Armijo rule runs a while loop to find the appropriate step_size by starting from a very large number (`alpha`). The step_size is found by reducing the step size (by a factor `beta`) in an iterative way until a certain criterion is met. To avoid infinite loops, we add a maximum number of times (`max_iterations`) the while loop is run.
@@ -85,9 +84,10 @@ class ArmijoStepSizeRule(StepSizeRule):
         The maximum number of iterations to find a suitable step size
 
     Reference
-    ---------
-    Algorithm 3.1 (Numerical Optimization, Nocedal, Wright) (https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf)
-    https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
+    ---------
+    - Algorithm 3.1 in Nocedal, J. and Wright, S.J., 1999. Numerical Optimization. New York, NY: Springer. https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf
+
+    - https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
 
     """
@@ -138,3 +138,110 @@ def get_step_size(self, algorithm):
             raise ValueError(
                 'Could not find a proper step_size in {} loops. Consider increasing alpha or max_iterations.'.format(self.max_iterations))
         return self.alpha
+
+
+class BarzilaiBorweinStepSizeRule(StepSizeRule):
+
+    r""" Applies the Barzilai-Borwein rule to calculate the step size (step_size).
+
+    Let :math:`\Delta x=x_k-x_{k-1}` and :math:`\Delta g=g_k-g_{k-1}`, where :math:`x_k` is the :math:`k` th iterate (the current solution after iteration :math:`k`) and :math:`g_k` is the gradient calculated at the :math:`k` th iterate, found in :code:`algorithm.gradient_update`. A Barzilai-Borwein (BB) iteration is :math:`x_{k+1}=x_k-\alpha_k g_k`, where the step size :math:`\alpha_k` is either
+
+    - :math:`\alpha_k^{LONG}=\frac{\Delta x\cdot\Delta x}{\Delta x\cdot\Delta g}`, or
+
+    - :math:`\alpha_k^{SHORT}=\frac{\Delta x\cdot\Delta g}{\Delta g\cdot\Delta g}`,
+
+    where the operator :math:`\cdot` is the standard inner product between two vectors.
+
+    This rule is suitable for use with gradient-based iterative methods where the calculated gradient is stored as `algorithm.gradient_update`.
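+
+    For example, the rule can be passed to :code:`GD` as follows. This is a minimal sketch: the random least-squares problem is purely illustrative, and the interfaces shown follow those exercised in the tests added with this change.
+
+    .. code-block:: python
+
+        import numpy as np
+        from cil.framework import VectorData
+        from cil.optimisation.algorithms import GD
+        from cil.optimisation.functions import LeastSquares
+        from cil.optimisation.operators import MatrixOperator
+        from cil.optimisation.utilities import BarzilaiBorweinStepSizeRule
+
+        # An illustrative least-squares problem
+        A = np.random.uniform(0, 1, (5, 10)).astype('float32')
+        b = A.dot(np.random.randn(10)).astype('float32')
+        Aop = MatrixOperator(A)
+        f = LeastSquares(Aop, b=VectorData(b), c=0.5)
+
+        # Start from the recommended 1/f.L, alternating between short and long BB steps
+        ss_rule = BarzilaiBorweinStepSizeRule(initial=1 / f.L, mode='alternate')
+        alg = GD(initial=Aop.domain.allocate(), objective_function=f, step_size=ss_rule)
+        alg.run(80)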
+
+    Parameters
+    ----------
+    initial: float, greater than zero
+        The step size for the first iteration. We recommend something of the order :math:`1/f.L`, where :math:`f` is the (differentiable part of) the objective you wish to minimise.
+    mode: one of 'long', 'short' or 'alternate', default is 'short'
+        Whether to calculate the step size using the LONG or SHORT formula, or to alternate between the two, starting with long.
+    stabilisation_param: 'auto', float or 'off', default is 'auto'
+        In order to add stability, the step size has an upper limit of :math:`\Delta/\|g_k\|`, where, by default ('auto'), the `stabilisation_param` :math:`\Delta` is determined automatically to be the minimum of :math:`\|\Delta x\|` over the first 3 iterations. The user can instead pass a fixed constant, or turn the stabilisation "off", equivalently passing `np.inf`.
+
+
+    Reference
+    ---------
+    - Barzilai, Jonathan; Borwein, Jonathan M. (1988). "Two-Point Step Size Gradient Methods". IMA Journal of Numerical Analysis. 8: 141-148. https://doi.org/10.1093/imanum/8.1.141
+
+    - Burdakov, O., Dai, Y. and Huang, N., 2019. Stabilized Barzilai-Borwein method. Journal of Computational Mathematics, 37(6). https://doi.org/10.4208/jcm.1911-m2019-0171
+
+    - https://en.wikipedia.org/wiki/Barzilai-Borwein_method
+    """
+
+    def __init__(self, initial, mode='short', stabilisation_param='auto'):
+        '''Initialises the step size rule
+        '''
+
+        self.mode = mode
+        if self.mode == 'short':
+            self.is_short = True
+        elif self.mode == 'long' or self.mode == 'alternate':
+            self.is_short = False
+        else:
+            raise ValueError('Mode should be chosen from "long", "short" or "alternate".')
+
+        self.store_grad = None
+        self.store_x = None
+        self.initial = initial
+        if stabilisation_param == 'auto':
+            self.adaptive = True
+            stabilisation_param = numpy.inf
+        elif stabilisation_param == 'off':
+            self.adaptive = False
+            stabilisation_param = numpy.inf
+        elif isinstance(stabilisation_param, Number) and stabilisation_param >= 0:
+            self.adaptive = False
+        else:
+            raise TypeError("The stabilisation_param should be 'auto', a non-negative number or 'off'")
+        self.stabilisation_param = stabilisation_param
+
+    def get_step_size(self, algorithm):
+        """
+        Applies the BB rule to calculate the step size (`step_size`)
+
+        Returns
+        -------
+        float
+            The calculated step size
+        """
+        # For the first iteration we use the initial step size, because the BB step size requires a previous iterate.
+        if self.store_x is None:
+            self.store_x = algorithm.x.copy()  # We store the last iterate in order to calculate the BB step size
+            self.store_grad = algorithm.gradient_update.copy()  # We store the last gradient in order to calculate the BB step size
+            return self.initial
+
+        gradient_norm = algorithm.gradient_update.norm()
+        # If the gradient is zero, gradient-based algorithms will not update and the step size calculation will divide by zero, so we stop iterations.
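+        # A small tolerance, rather than an exact zero test, guards against floating-point round-off.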
+        if gradient_norm < 1e-8:
+            raise StopIteration
+
+        algorithm.x.subtract(self.store_x, out=self.store_x)
+        algorithm.gradient_update.subtract(self.store_grad, out=self.store_grad)
+        if self.is_short:
+            ret = (self.store_x.dot(self.store_grad)) / (self.store_grad.dot(self.store_grad))
+        else:
+            ret = (self.store_x.dot(self.store_x)) / (self.store_x.dot(self.store_grad))
+
+        # This computes the default stabilisation parameter, using the first three iterations
+        if algorithm.iteration <= 3 and self.adaptive:
+            self.stabilisation_param = min(self.stabilisation_param, self.store_x.norm())
+
+        # Compute the step size as the minimum of `ret`, above, and :math:`\Delta/\|g_k\|`, ignoring any NaN values.
+        ret = numpy.nanmin(numpy.array([ret, self.stabilisation_param / gradient_norm]))
+
+        # We store the last iterate and gradient in order to calculate the BB step size
+        self.store_x.fill(algorithm.x)
+        self.store_grad.fill(algorithm.gradient_update)
+
+        if self.mode == 'alternate':
+            self.is_short = not self.is_short
+
+        return ret
\ No newline at end of file
diff --git a/Wrappers/Python/cil/optimisation/utilities/__init__.py b/Wrappers/Python/cil/optimisation/utilities/__init__.py
index cde00acbae..daa32667a0 100644
--- a/Wrappers/Python/cil/optimisation/utilities/__init__.py
+++ b/Wrappers/Python/cil/optimisation/utilities/__init__.py
@@ -19,5 +19,5 @@
 from .sampler import Sampler
 from .sampler import SamplerRandom
-from .StepSizeMethods import ConstantStepSize, ArmijoStepSizeRule, StepSizeRule
+from .StepSizeMethods import ConstantStepSize, ArmijoStepSizeRule, StepSizeRule, BarzilaiBorweinStepSizeRule
 from .preconditioner import Preconditioner, AdaptiveSensitivity, Sensitivity
diff --git a/Wrappers/Python/test/test_stepsizes.py b/Wrappers/Python/test/test_stepsizes.py
index be601eb7ac..860da40896 100644
--- a/Wrappers/Python/test/test_stepsizes.py
+++ b/Wrappers/Python/test/test_stepsizes.py
@@ -1,9 +1,9 @@
 from cil.optimisation.algorithms import SIRT, GD, ISTA, FISTA
 from cil.optimisation.functions import LeastSquares, IndicatorBox
-from cil.framework import ImageGeometry, VectorGeometry
+from cil.framework import ImageGeometry, VectorGeometry, VectorData
 from cil.optimisation.operators import IdentityOperator, MatrixOperator
-from cil.optimisation.utilities import Sensitivity, AdaptiveSensitivity, Preconditioner, ConstantStepSize, ArmijoStepSizeRule
+from cil.optimisation.utilities import Sensitivity, AdaptiveSensitivity, Preconditioner, ConstantStepSize, ArmijoStepSizeRule, BarzilaiBorweinStepSizeRule
 import numpy as np
 
 from testclass import CCPiTestClass
@@ -77,3 +77,169 @@ def test_armijo_calculation(self):
         alg.gradient_update = ig.allocate(-2)
         step_size = test_stepsize.get_step_size(alg)
         self.assertAlmostEqual(step_size, 2)
+
+    def test_bb(self):
+        n = 10
+        m = 5
+
+        A = np.random.uniform(0, 1, (m, n)).astype('float32')
+        b = (A.dot(np.random.randn(n)) + 0.1 *
+             np.random.randn(m)).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig = Aop.domain
+        initial = ig.allocate()
+        f = LeastSquares(Aop, b=bop, c=0.5)
+
+        ss_rule = BarzilaiBorweinStepSizeRule(2)
+        self.assertEqual(ss_rule.mode, 'short')
+        self.assertEqual(ss_rule.initial, 2)
+        self.assertEqual(ss_rule.adaptive, True)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+
+        # Check the right errors are raised for incorrect parameters
+        with self.assertRaises(TypeError):
+            ss_rule = BarzilaiBorweinStepSizeRule(2, 'short', -4)
+        with self.assertRaises(TypeError):
+            ss_rule = BarzilaiBorweinStepSizeRule(2, 'long', 'banana')
+        with self.assertRaises(ValueError):
+            ss_rule = BarzilaiBorweinStepSizeRule(2, 'banana', 3)
+
+        # Check stabilisation parameter unchanged if fixed
+        ss_rule = BarzilaiBorweinStepSizeRule(2, 'long', 3)
+        self.assertEqual(ss_rule.mode, 'long')
+        self.assertFalse(ss_rule.adaptive)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param, 3)
+        alg.run(2)
+        self.assertEqual(ss_rule.stabilisation_param, 3)
+
+        # Check infinity can be passed
+        ss_rule = BarzilaiBorweinStepSizeRule(2, 'short', 'off')
+        self.assertEqual(ss_rule.mode, 'short')
+        self.assertFalse(ss_rule.adaptive)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(2)
+
+        n = 5
+        m = 5
+
+        A = np.eye(5).astype('float32')
+        b = (np.array([.5, .5, .5, .5, .5])).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig = Aop.domain
+        initial = ig.allocate(0)
+        f = LeastSquares(Aop, b=bop, c=0.5)
+        ss_rule = BarzilaiBorweinStepSizeRule(0.22, 'long', np.inf)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertFalse(ss_rule.is_short)
+        # Check the initial step size was used
+        alg.run(1)
+        self.assertNumpyArrayAlmostEqual(np.array([.11, .11, .11, .11, .11]), alg.x.as_array())
+        self.assertFalse(ss_rule.is_short)
+        # Check the long step size formula
+        alg.run(1)
+        x_change = np.array([.11, .11, .11, .11, .11]) - np.array([0, 0, 0, 0, 0])
+        grad_change = -np.array([.39, .39, .39, .39, .39]) + np.array([.5, .5, .5, .5, .5])
+        step = x_change.dot(x_change) / x_change.dot(grad_change)
+        self.assertNumpyArrayAlmostEqual(np.array([.11, .11, .11, .11, .11]) + step * np.array([.39, .39, .39, .39, .39]), alg.x.as_array())
+        self.assertFalse(ss_rule.is_short)
+
+        ss_rule = BarzilaiBorweinStepSizeRule(0.22, 'short', np.inf)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertTrue(ss_rule.is_short)
+        # Check the initial step size was used
+        alg.run(1)
+        self.assertNumpyArrayAlmostEqual(np.array([.11, .11, .11, .11, .11]), alg.x.as_array())
+        self.assertTrue(ss_rule.is_short)
+        # Check the short step size formula
+        alg.run(1)
+        x_change = np.array([.11, .11, .11, .11, .11]) - np.array([0, 0, 0, 0, 0])
+        grad_change = -np.array([.39, .39, .39, .39, .39]) + np.array([.5, .5, .5, .5, .5])
+        step = x_change.dot(grad_change) / grad_change.dot(grad_change)
+        self.assertNumpyArrayAlmostEqual(np.array([.11, .11, .11, .11, .11]) + step * np.array([.39, .39, .39, .39, .39]), alg.x.as_array())
+        self.assertTrue(ss_rule.is_short)
+
+        # Check a zero gradient raises StopIteration
+        ss_rule = BarzilaiBorweinStepSizeRule(1, 'long', np.inf)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(500)
+        self.assertEqual(alg.iteration, 1)
+
+        # Check the adaptive stabilisation parameter is updated
+        ss_rule = BarzilaiBorweinStepSizeRule(0.001, 'long', 'auto')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+        alg.run(2)
+        self.assertNotEqual(ss_rule.stabilisation_param, np.inf)
+
+        # Check the stabilisation parameter stops adapting after the first 3 iterations
+        ss_rule = BarzilaiBorweinStepSizeRule(0.0000001, 'long', 'auto')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+        alg.run(4)
+        self.assertNotEqual(ss_rule.stabilisation_param, np.inf)
+        a = ss_rule.stabilisation_param
+        alg.run(1)
+        self.assertEqual(ss_rule.stabilisation_param, a)
+
+        # Test alternating
+        ss_rule = BarzilaiBorweinStepSizeRule(0.0000001, 'alternate', 'auto')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
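+        # 'alternate' starts with the long step; the flip happens after each computed BB step (the first iteration uses the initial step size)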
+        self.assertFalse(ss_rule.is_short)
+        alg.run(2)
+        self.assertTrue(ss_rule.is_short)
+        alg.run(1)
+        self.assertFalse(ss_rule.is_short)
+        alg.run(1)
+        self.assertTrue(ss_rule.is_short)
+
+    def test_bb_converge(self):
+        n = 10
+        m = 5
+        np.random.seed(4)
+        A = np.random.uniform(0, 1, (m, n)).astype('float32')
+        b = (A.dot(np.random.randn(n)) + 0.1 *
+             np.random.randn(m)).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig = Aop.domain
+        initial = ig.allocate()
+        f = LeastSquares(Aop, b=bop, c=2)
+
+        ss_rule = ArmijoStepSizeRule(max_iterations=40)
+        alg_true = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg_true.run(300, verbose=0)
+
+        ss_rule = BarzilaiBorweinStepSizeRule(1 / f.L, 'short')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
+
+        ss_rule = BarzilaiBorweinStepSizeRule(1 / f.L, 'long')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
+
+        ss_rule = BarzilaiBorweinStepSizeRule(1 / f.L, 'alternate')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
diff --git a/docs/source/optimisation.rst b/docs/source/optimisation.rst
index ec4381cce0..6e2268d67f 100644
--- a/docs/source/optimisation.rst
+++ b/docs/source/optimisation.rst
@@ -664,6 +664,10 @@ We also have a number of example classes:
 .. autoclass:: cil.optimisation.utilities.StepSizeMethods.ArmijoStepSizeRule
    :members:
 
+.. autoclass:: cil.optimisation.utilities.StepSizeMethods.BarzilaiBorweinStepSizeRule
+   :members:
+
+
 Preconditioners
 ----------------