Added BB step size rule to our step size methods (#1859)

--------- Signed-off-by: Margaret Duff <43645617+MargaretDuff@users.noreply.github.com>
TomographicImaging · Aug 22, 2024 · 1088bc3 · 1088bc3
1 parent 6d9978b
commit 1088bc3
Show file tree

Hide file tree

Showing 5 changed files with 287 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
     - Added SVRG and LSVRG stochastic functions (#1625)
     - Added SAG and SAGA stochastic functions (#1624)
     - Allow `SumFunction` with 1 item (#1857)
+    - Added Barzilai-Borwein step size rule to work with GD, ISTA, FISTA (#1859)
     - Added callback `optimisation.utilities.callbacks.EarlyStoppingObjectiveValue` which stops iterations if an algorithm objective changes less than a provided threshold (#1892)
     - Added callback `optimisation.utilities.callbacks.CGLSEarlyStopping` which replicates the automatic behaviour of CGLS in CIL versions <=24. (#1892)
   - Enhancements:

diff --git a/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py b/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
@@ -18,7 +18,7 @@
 
 from abc import ABC, abstractmethod
 import numpy
-
+from numbers import Number
 
 class StepSizeRule(ABC):
     """
@@ -70,8 +70,7 @@ def get_step_size(self, algorithm):
 
 class ArmijoStepSizeRule(StepSizeRule):
 
-    """ 
-    Applies the Armijo rule to calculate the step size (step_size).
+    r""" Applies the Armijo rule to calculate the step size (step_size).
 
     The Armijo rule runs a while loop to find the appropriate step_size by starting from a very large number (`alpha`). The step_size is found by reducing the step size (by a factor `beta`) in an iterative way until a certain criterion is met. To avoid infinite loops, we add a maximum number of times (`max_iterations`) the while loop is run.
 
@@ -85,9 +84,10 @@ class ArmijoStepSizeRule(StepSizeRule):
         The maximum number of iterations to find a suitable step size 
 
     Reference
-    ---------
-    Algorithm 3.1 (Numerical Optimization, Nocedal, Wright) (https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf)
-     https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
+    ------------
+    - Algorithm 3.1 in Nocedal, J. and Wright, S.J. eds., 1999. Numerical optimization. New York, NY: Springer New York. https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf
+    
+    - https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
 
     """
 
@@ -138,3 +138,110 @@ def get_step_size(self, algorithm):
             raise ValueError(
                 'Could not find a proper step_size in {} loops. Consider increasing alpha or max_iterations.'.format(self.max_iterations))
         return self.alpha
+
+
+class BarzilaiBorweinStepSizeRule(StepSizeRule):
+
+    r""" Applies the Barzilai-Borwein rule to calculate the step size (step_size).
+
+    Let :math:`\Delta x=x_k-x_{k-1}` and :math:`\Delta g=g_k-g_{k-1}`, where :math:`x_k` is the :math:`k` th iterate (current solution after iteration :math:`k` ) and :math:`g_k` is the gradient calculated at the :math:`k` th iterate, found in :code:`algorithm.gradient_update`.  A Barzilai-Borwein (BB) iteration is :math:`x_{k+1}=x_k-\alpha_kg_k` where the step size :math:`\alpha _k` is either
+
+    - :math:`\alpha_k^{LONG}=\frac{\Delta x\cdot\Delta x}{\Delta x\cdot\Delta g}`, or
+
+    - :math:`\alpha_k^{SHORT}=\frac{\Delta x \cdot\Delta g}{\Delta g \cdot\Delta g}`.
+    
+    Here the operator :math:`\cdot` is the standard inner product between two vectors. 
+    
+    This is suitable for use with gradient based iterative methods where the calculated gradient is stored as `algorithm.gradient_update`.
+    
+    Parameters
+    ----------
+    initial: float, greater than zero 
+        The step-size for the first iteration. We recommend something of the order :math:`1/f.L` where :math:`f` is the (differentiable part of) the objective you wish to minimise.
+    mode: One of 'long', 'short' or 'alternate', default is 'short'. 
+        This calculates the step-size based on the LONG rule, the SHORT rule, or alternating between the two, starting with long. 
+    stabilisation_param: 'auto', float or 'off', default is 'auto'
+        In order to add stability the step-size has an upper limit of :math:`\Delta/\|g_k\|` where, by default ('auto'), the `stabilisation_param`, :math:`\Delta`, is determined automatically to be the minimum of :math:`\|\Delta x\|` from the first 3 iterations. The user can also pass a fixed constant or turn "off" the stabilisation, equivalently passing `np.inf`.
+        
+    
+    Reference
+    ---------
+    - Barzilai, Jonathan; Borwein, Jonathan M. (1988). "Two-Point Step Size Gradient Methods". IMA Journal of Numerical Analysis. 8: 141–148, https://doi.org/10.1093/imanum/8.1.141
+    
+    - Burdakov, O., Dai, Y. and Huang, N., 2019. STABILIZED BARZILAI-BORWEIN METHOD. Journal of Computational Mathematics, 37(6). https://doi.org/10.4208/jcm.1911-m2019-0171
+
+    - https://en.wikipedia.org/wiki/Barzilai-Borwein_method
+    """
+
+    def __init__(self, initial, mode='short', stabilisation_param="auto"):
+        '''Initialises the step size rule 
+        '''
+
+        self.mode=mode
+        if self.mode == 'short':
+            self.is_short = True
+        elif self.mode == 'long' or self.mode == 'alternate':
+            self.is_short = False
+        else:
+            raise ValueError('Mode should be chosen from "long", "short" or "alternate". ')
+
+        self.store_grad=None 
+        self.store_x=None
+        self.initial=initial
+        if stabilisation_param == 'auto':
+            self.adaptive = True
+            stabilisation_param = numpy.inf
+        elif stabilisation_param == "off":
+            self.adaptive = False 
+            stabilisation_param = numpy.inf
+        elif ( isinstance(stabilisation_param, Number) and stabilisation_param >=0):
+            self.adaptive = False 
+        else:
+            raise TypeError(" The stabilisation_param should be 'auto', a positive number or 'off'")
+        self.stabilisation_param=stabilisation_param
+
+
+
+    def get_step_size(self, algorithm):
+        """
+        Applies the B-B rule to calculate the step size (`step_size`)
+
+        Returns
+        --------
+        the calculated step size:float
+
+        """
+        #For the first iteration we use an initial step size because the BB step size requires a previous iterate. 
+        if self.store_x is None:
+            self.store_x=algorithm.x.copy() # We store the last iterate in order to calculate the BB step size 
+            self.store_grad=algorithm.gradient_update.copy()# We store the last gradient in order to calculate the BB step size 
+            return self.initial
+
+        gradient_norm = algorithm.gradient_update.norm()
+        #If the gradient is zero, gradient based algorithms will not update and the step size calculation will divide by zero so we stop iterations. 
+        if gradient_norm < 1e-8:
+            raise StopIteration
+
+        algorithm.x.subtract(self.store_x, out=self.store_x) 
+        algorithm.gradient_update.subtract(self.store_grad, out=self.store_grad)
+        if self.is_short:
+                ret = (self.store_x.dot(self.store_grad))/ (self.store_grad.dot(self.store_grad))
+        else:
+            ret = (self.store_x.dot(self.store_x))/ (self.store_x.dot(self.store_grad))
+
+
+        #This computes the default stabilisation parameter, using the first three iterations
+        if (algorithm.iteration <=3 and self.adaptive):
+            self.stabilisation_param = min(self.stabilisation_param, self.store_x.norm() )
+
+        # Computes the step size as the minimum of the ret, above, and :math:`\Delta/\|g_k\|` ignoring any NaN values. 
+        ret = numpy.nanmin( numpy.array([ret, self.stabilisation_param/gradient_norm]))
+
+        # We store the last iterate and gradient in order to calculate the BB step size 
+        self.store_x.fill(algorithm.x)
+        self.store_grad.fill(algorithm.gradient_update)
+
+        if self.mode == "alternate":
+            self.is_short =  not self.is_short       
+
+        return ret
diff --git a/Wrappers/Python/cil/optimisation/utilities/__init__.py b/Wrappers/Python/cil/optimisation/utilities/__init__.py
@@ -19,5 +19,5 @@
 
 from .sampler import Sampler
 from .sampler import SamplerRandom
-from .StepSizeMethods import ConstantStepSize, ArmijoStepSizeRule, StepSizeRule
+from .StepSizeMethods import ConstantStepSize, ArmijoStepSizeRule, StepSizeRule, BarzilaiBorweinStepSizeRule
 from .preconditioner import  Preconditioner, AdaptiveSensitivity, Sensitivity
diff --git a/Wrappers/Python/test/test_stepsizes.py b/Wrappers/Python/test/test_stepsizes.py
@@ -1,9 +1,9 @@
 from cil.optimisation.algorithms import SIRT, GD, ISTA, FISTA
 from cil.optimisation.functions import LeastSquares, IndicatorBox
-from cil.framework import ImageGeometry, VectorGeometry
+from cil.framework import ImageGeometry, VectorGeometry, VectorData
 from cil.optimisation.operators import IdentityOperator, MatrixOperator
 
-from cil.optimisation.utilities import Sensitivity, AdaptiveSensitivity, Preconditioner, ConstantStepSize, ArmijoStepSizeRule
+from cil.optimisation.utilities import Sensitivity, AdaptiveSensitivity, Preconditioner, ConstantStepSize, ArmijoStepSizeRule, BarzilaiBorweinStepSizeRule
 import numpy as np
 
 from testclass import CCPiTestClass
@@ -77,3 +77,169 @@ def test_armijo_calculation(self):
         alg.gradient_update = ig.allocate(-2)
         step_size = test_stepsize.get_step_size(alg)
         self.assertAlmostEqual(step_size, 2)
+
+    def test_bb(self):
+        n = 10
+        m = 5
+
+        A = np.random.uniform(0, 1, (m, n)).astype('float32')
+        b = (A.dot(np.random.randn(n)) + 0.1 *
+             np.random.randn(m)).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig=Aop.domain
+        initial = ig.allocate()
+        f = LeastSquares(Aop, b=bop, c=0.5)
+
+        ss_rule=BarzilaiBorweinStepSizeRule(2 )
+        self.assertEqual(ss_rule.mode, 'short')
+        self.assertEqual(ss_rule.initial, 2)
+        self.assertEqual(ss_rule.adaptive, True)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+
+        #Check the right errors are raised for incorrect parameters 
+
+        with self.assertRaises(TypeError):
+            ss_rule=BarzilaiBorweinStepSizeRule(2,'short',-4, )
+        with self.assertRaises(TypeError):
+            ss_rule=BarzilaiBorweinStepSizeRule(2,'long', 'banana', )
+        with self.assertRaises(ValueError):
+            ss_rule=BarzilaiBorweinStepSizeRule(2, 'banana',3 )
+
+
+        #Check stabilisation parameter unchanged if fixed 
+        ss_rule=BarzilaiBorweinStepSizeRule(2, 'long',3 )
+        self.assertEqual(ss_rule.mode, 'long')
+        self.assertFalse(ss_rule.adaptive)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param,3)
+        alg.run(2)
+        self.assertEqual(ss_rule.stabilisation_param,3)
+
+        #Check infinity can be passed 
+        ss_rule=BarzilaiBorweinStepSizeRule(2, 'short',"off" )
+        self.assertEqual(ss_rule.mode, 'short')
+        self.assertFalse(ss_rule.adaptive)
+        self.assertEqual(ss_rule.stabilisation_param,np.inf)
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(2)
+
+        n = 5
+        m = 5
+
+        A = np.eye(5).astype('float32')
+        b = (np.array([.5,.5,.5,.5,.5])).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig=Aop.domain
+        initial = ig.allocate(0)
+        f = LeastSquares(Aop, b=bop, c=0.5)
+        ss_rule=BarzilaiBorweinStepSizeRule(0.22, 'long',np.inf )
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertFalse(ss_rule.is_short)
+        #Check the initial step size was used
+        alg.run(1)
+        self.assertNumpyArrayAlmostEqual( np.array([.11,.11,.11,.11,.11]), alg.x.as_array() )
+        self.assertFalse(ss_rule.is_short)
+        #check long 
+        alg.run(1)
+        x_change= np.array([.11,.11,.11,.11,.11])-np.array([0,0,0,0,0])
+        grad_change = -np.array([.39,.39,.39,.39,.39])+np.array([.5,.5,.5,.5,.5])
+        step= x_change.dot(x_change)/x_change.dot(grad_change)
+        self.assertNumpyArrayAlmostEqual( np.array([.11,.11,.11,.11,.11])+step*np.array([.39,.39,.39,.39,.39]), alg.x.as_array() )
+        self.assertFalse(ss_rule.is_short)
+
+        ss_rule=BarzilaiBorweinStepSizeRule(0.22, 'short',np.inf )
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertTrue(ss_rule.is_short)
+        #Check the initial step size was used
+        alg.run(1)
+        self.assertNumpyArrayAlmostEqual( np.array([.11,.11,.11,.11,.11]), alg.x.as_array() )
+        self.assertTrue(ss_rule.is_short)
+        #check short
+        alg.run(1)
+        x_change= np.array([.11,.11,.11,.11,.11])-np.array([0,0,0,0,0])
+        grad_change = -np.array([.39,.39,.39,.39,.39])+np.array([.5,.5,.5,.5,.5])
+        step= x_change.dot(grad_change)/grad_change.dot(grad_change)
+        self.assertNumpyArrayAlmostEqual( np.array([.11,.11,.11,.11,.11])+step*np.array([.39,.39,.39,.39,.39]), alg.x.as_array() )
+        self.assertTrue(ss_rule.is_short)
+
+        #check stop iteration 
+        ss_rule=BarzilaiBorweinStepSizeRule(1, 'long',np.inf )
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(500)
+        self.assertEqual(alg.iteration, 1)
+
+        #check adaptive
+        ss_rule=BarzilaiBorweinStepSizeRule(0.001, 'long',"auto")
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+        alg.run(2)
+        self.assertNotEqual(ss_rule.stabilisation_param, np.inf)
+
+        #check stops being adaptive 
+
+        ss_rule=BarzilaiBorweinStepSizeRule(0.0000001, 'long',"auto" )
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertEqual(ss_rule.stabilisation_param, np.inf)
+        alg.run(4)
+        self.assertNotEqual(ss_rule.stabilisation_param, np.inf)
+        a=ss_rule.stabilisation_param
+        alg.run(1)
+        self.assertEqual(ss_rule.stabilisation_param, a)
+
+        #Test alternating
+        ss_rule=BarzilaiBorweinStepSizeRule(0.0000001, 'alternate',"auto" )
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        self.assertFalse(ss_rule.is_short)
+        alg.run(2)
+        self.assertTrue(ss_rule.is_short)
+        alg.run(1)
+        self.assertFalse(ss_rule.is_short)
+        alg.run(1)
+        self.assertTrue(ss_rule.is_short)
+
+
+
+
+    def test_bb_converge(self):
+        n = 10
+        m = 5
+        np.random.seed(4)
+        A = np.random.uniform(0, 1, (m, n)).astype('float32')
+        b = (A.dot(np.random.randn(n)) + 0.1 *
+             np.random.randn(m)).astype('float32')
+
+        Aop = MatrixOperator(A)
+        bop = VectorData(b)
+        ig=Aop.domain
+        initial = ig.allocate()
+        f = LeastSquares(Aop, b=bop, c=2)
+
+        ss_rule=ArmijoStepSizeRule(max_iterations=40)
+        alg_true = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg_true .run(300, verbose=0)
+
+
+
+        ss_rule=BarzilaiBorweinStepSizeRule(1/f.L, 'short')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
+
+
+        ss_rule=BarzilaiBorweinStepSizeRule(1/f.L, 'long')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
+
+
+        ss_rule=BarzilaiBorweinStepSizeRule(1/f.L, 'alternate')
+        alg = GD(initial=initial, objective_function=f, step_size=ss_rule)
+
+        alg.run(80, verbose=0)
+        self.assertNumpyArrayAlmostEqual(alg.x.as_array(), alg_true.x.as_array(), decimal=3)
+
+
diff --git a/docs/source/optimisation.rst b/docs/source/optimisation.rst
@@ -664,6 +664,10 @@ We also have a number of example classes:
 .. autoclass:: cil.optimisation.utilities.StepSizeMethods.ArmijoStepSizeRule
    :members:
 
+.. autoclass:: cil.optimisation.utilities.StepSizeMethods.BarzilaiBorweinStepSizeRule
+   :members:
+
+
 
 Preconditioners
 ----------------