From 6bf107c61a76da76049002edcca4088d3113a661 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:17:45 +0200 Subject: [PATCH 01/13] fixing merge mess --- src/fw_algorithms.jl | 1033 +++++++++++++++++++++--------------------- 1 file changed, 517 insertions(+), 516 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 846ea3f5b..7ad67243f 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -1,6 +1,6 @@ """ - frank_wolfe(f, grad!, lmo, x0; ...) +frank_wolfe(f, grad!, lmo, x0; ...) Simplest form of the Frank-Wolfe algorithm. Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: @@ -11,205 +11,179 @@ Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: - `traj_data` vector of trajectory information. """ function frank_wolfe( - f, - grad!, - lmo, - x0; - line_search::LineSearchMethod=Adaptive(), - L=Inf, - gamma0=0, - step_lim=20, - momentum=nothing, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis::Emphasis=memory, - gradient=nothing, - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f, +grad!, +lmo, +x0; +line_search::LineSearchMethod=Adaptive(), +L=Inf, +gamma0=0, +step_lim=20, +momentum=nothing, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +linesearch_tol=1e-7, +emphasis::Emphasis=memory, +gradient=nothing, +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - tt = regular - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - time_start = time_ns() +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e\n" +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +tt = regular +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +time_start = time_ns() - if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") +end - if line_search isa FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") - end +if line_search isa FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") +end - if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) - println( - "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", - ) - end +if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + println( + "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", + ) +end - if verbose - println("\nVanilla Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - grad_type = typeof(gradient) - println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") - if emphasis === memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] - print_callback(headers, format_string, print_header=true) +if verbose + println("\nVanilla Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + grad_type = typeof(gradient) + println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") + if emphasis === memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - # if integer, convert element type to most appropriate float - if eltype(x) <: Integer - x = convert(Array{float(eltype(x))}, x) - else - x = convert(Array{eltype(x)}, x) - end - end - first_iter = true - # instanciating container for gradient - if gradient === nothing - gradient = similar(x) +if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + # if integer, convert element type to most appropriate float + if eltype(x) <: Integer + x = convert(Array{float(eltype(x))}, x) + else + x = convert(Array{eltype(x)}, x) end +end +first_iter = true +# instanciating container for gradient +if gradient === nothing + gradient = similar(x) +end - # container for direction - d = similar(x) - gtemp = if momentum === nothing - nothing - else - similar(x) +# container for direction +d = similar(x) +gtemp = if momentum === nothing + nothing +else + similar(x) +end +while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop end - while t <= max_iteration && dual_gap >= max(epsilon, eps()) + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" end + break end + end - ##################### + ##################### - if momentum === nothing || first_iter - grad!(gradient, x) - if momentum !== 
nothing - gtemp .= gradient - end - else - grad!(gtemp, x) - @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) - end - first_iter = false - - v = compute_extreme_point(lmo, gradient) - # go easy on the memory - only compute if really needed - if ( - (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - line_search isa Shortstep - ) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if momentum === nothing || first_iter + grad!(gradient, x) + if momentum !== nothing + gtemp .= gradient end + else + grad!(gtemp, x) + @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) + end + first_iter = false - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end - @emphasis(emphasis, d = x - v) + v = compute_extreme_point(lmo, gradient) + # go easy on the memory - only compute if really needed + if ( + (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + line_search isa Shortstep + ) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - momentum === nothing ? gradient : gtemp, # use appropriate storage - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - one(eltype(x)), + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, ) - - @emphasis(emphasis, x = x - gamma * d) - - if (mod(t, print_iter) == 0 && verbose) - tt = regular - if t == 0 - tt = initial - end - - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - - flush(stdout) - end - t = t + 1 + callback(state) end - # recompute everything once for final verfication / do not record to trajectory though for now! - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. + @emphasis(emphasis, d = x - v) + + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + momentum === nothing ? gradient : gtemp, # use appropriate storage + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + one(eltype(x)), + ) + + @emphasis(emphasis, x = x - gamma * d) + + if (mod(t, print_iter) == 0 && verbose) + tt = regular + if t == 0 + tt = initial + end - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -217,15 +191,41 @@ function frank_wolfe( t / tot_time, ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) + flush(stdout) end - return x, v, primal, dual_gap, traj_data + t = t + 1 +end +# recompute everything once for final verfication / do not record to trajectory though for now! +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. 
+ +grad!(gradient, x) +v = compute_extreme_point(lmo, gradient) +primal = f(x) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) +if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end """ - lazified_conditional_gradient +lazified_conditional_gradient Similar to [`frank_wolfe`](@ref) but lazyfying the LMO: each call is stored in a cache, which is looked up first for a good-enough direction. @@ -233,196 +233,170 @@ The cache used is a [`FrankWolfe.MultiCacheLMO`](@ref) or a [`FrankWolfe.VectorC depending on whether the provided `cache_size` option is finite. """ function lazified_conditional_gradient( - f, - grad!, - lmo_base, - x0; - line_search::LineSearchMethod=Adaptive(), - L=Inf, - gamma0=0, - K=2.0, - cache_size=Inf, - greedy_lazy=false, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - step_lim=20, - emphasis::Emphasis=memory, - gradient=nothing, - VType=typeof(x0), - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f, +grad!, +lmo_base, +x0; +line_search::LineSearchMethod=Adaptive(), +L=Inf, +gamma0=0, +K=2.0, +cache_size=Inf, +greedy_lazy=false, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +verbose_it=false, +linesearch_tol=1e-7, +step_lim=20, +emphasis::Emphasis=memory, +gradient=nothing, +VType=typeof(x0), +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" - if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) - else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) - end +if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) +else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) +end - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - phi = Inf - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - tt = regular - time_start = time_ns() +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +phi = Inf +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +tt = regular +time_start = time_ns() - if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. 
Prepare to blow up spectacularly.") +end - if line_search isa Agnostic || line_search isa Nonconvex - println("FATAL: Lazification is not known to converge with open-loop step size strategies.") - end +if line_search isa Agnostic || line_search isa Nonconvex + println("FATAL: Lazification is not known to converge with open-loop step size strategies.") +end - if verbose - println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", - ) - grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = - ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] - print_callback(headers, format_string, print_header=true) +if verbose + println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", + ) + grad_type = typeof(gradient) + println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = + ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - x = convert(Array{float(eltype(x))}, x) - end +if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + x = convert(Array{float(eltype(x))}, x) +end - if gradient === nothing - gradient = similar(x) - end +if gradient === nothing + gradient = similar(x) +end - # container for direction - d = similar(x) +# container for direction +d = similar(x) - while t <= max_iteration && dual_gap >= max(epsilon, eps()) +while t <= max_iteration && dual_gap >= max(epsilon, eps()) - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" end + break end + end - ##################### - - grad!(gradient, x) - - threshold = fast_dot(x, gradient) - phi / K + ##################### - # go easy on the memory - only compute if really needed - if ((mod(t, print_iter) == 0 && verbose ) || callback !== nothing) - primal = f(x) - end + grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) - end + threshold = fast_dot(x, gradient) - phi / K - 
if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - cache_size=length(lmo), - x=x, - v=v, - ) - callback(state) - end + # go easy on the memory - only compute if really needed + if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) + primal = f(x) + end - @emphasis(emphasis, d = x - v) + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - gradient, - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - 1.0, + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + cache_size=length(lmo), + x=x, + v=v, ) - - @emphasis(emphasis, x = x - gamma * d) - - if verbose && (mod(t, print_iter) == 0 || tt == dualstep) - if t == 0 - tt = initial - end - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - length(lmo), - ) - print_callback(rep, format_string) - flush(stdout) - end - t += 1 + callback(state) end - # recompute everything once for final verfication / do not record to trajectory though for now! - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 + @emphasis(emphasis, d = x - v) + + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + gradient, + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + 1.0, + ) + + @emphasis(emphasis, x = x - gamma * d) + + if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) + if t == 0 + tt = initial + end rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -431,208 +405,209 @@ function lazified_conditional_gradient( length(lmo), ) print_callback(rep, format_string) - if verbose - print_callback(nothing, format_string, print_footer=true) - end flush(stdout) end - return x, v, primal, dual_gap, traj_data + t += 1 +end + +# recompute everything once for final verfication / do not record to trajectory though for now! +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. +grad!(gradient, x) +v = compute_extreme_point(lmo, gradient) +primal = f(x) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + +if verbose || verbose_it + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + length(lmo), + ) + print_callback(rep, format_string) + if verbose + print_callback(nothing, format_string, print_footer=true) + end + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end """ - stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) +stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) 
Stochastic version of Frank-Wolfe, evaluates the objective and gradient stochastically, implemented through the [FrankWolfe.StochasticObjective](@ref) interface. """ function stochastic_frank_wolfe( - f::StochasticObjective, - lmo, - x0; - line_search::LineSearchMethod=nonconvex, - L=Inf, - gamma0=0, - step_lim=20, - momentum=nothing, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis::Emphasis=blas, - rng=Random.GLOBAL_RNG, - batch_size=length(f.xs) ÷ 10 + 1, - full_evaluation=false, - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f::StochasticObjective, +lmo, +x0; +line_search::LineSearchMethod=nonconvex, +L=Inf, +gamma0=0, +step_lim=20, +momentum=nothing, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +linesearch_tol=1e-7, +emphasis::Emphasis=blas, +rng=Random.GLOBAL_RNG, +batch_size=length(f.xs) ÷ 10 + 1, +full_evaluation=false, +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - tt = regular - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) - time_start = time_ns() +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +tt = regular +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) +time_start = time_ns() - if line_search == Shortstep && L == Inf - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search == Shortstep && L == Inf + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") +end - if line_search == FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") - end +if line_search == FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") +end - if verbose - println("\nStochastic Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", - ) - # TODO: needs to fix - grad_type = typeof(nothing) - println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") - print_callback(headers, format_string, print_header=true) +if verbose + println("\nStochastic Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", + ) + # TODO: needs to fix + grad_type = typeof(nothing) + println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Array) - x = convert(Array{promote_type(eltype(x), Float64)}, x) +if emphasis == memory && !isa(x, Array) + x = convert(Array{promote_type(eltype(x), Float64)}, x) +end +first_iter = true +gradient = 0 +while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop end - first_iter = true - gradient = 0 - while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break - end - end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 - ##################### - - if momentum === nothing || first_iter - gradient = compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - else - @emphasis( - emphasis, - gradient = - (momentum * gradient) .+ - (1 - momentum) * compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - ) + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - first_iter = false + end - v = compute_extreme_point(lmo, gradient) + ##################### - # go easy on the memory - only compute if really needed - if (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) - primal = compute_value(f, x, full_evaluation=true) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + if momentum === nothing || first_iter + gradient = compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + else + @emphasis( + emphasis, + gradient = + (momentum * gradient) .+ + (1 - momentum) * compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + 
full_evaluation=full_evaluation, + ) + ) + end + first_iter = false - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end + v = compute_extreme_point(lmo, gradient) - if line_search isa Agnostic - gamma = 2 // (2 + t) - elseif line_search isa Nonconvex - gamma = 1 / sqrt(t + 1) - elseif line_search isa Shortstep - gamma = dual_gap / (L * norm(x - v)^2) - elseif line_search isa RationalShortstep - rat_dual_gap = sum((x - v) .* gradient) - gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) - elseif line_search isa FixedStep - gamma = gamma0 - end + # go easy on the memory - only compute if really needed + if (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) + primal = compute_value(f, x, full_evaluation=true) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end - if mod(t, print_iter) == 0 && verbose - tt = regular - if t == 0 - tt = initial - end - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - flush(stdout) - end - t += 1 + if line_search isa Agnostic + gamma = 2 // (2 + t) + elseif line_search isa Nonconvex + gamma = 1 / sqrt(t + 1) + elseif line_search isa Shortstep + gamma = dual_gap / (L * norm(x - v)^2) + elseif line_search isa RationalShortstep + rat_dual_gap = sum((x - v) .* gradient) + gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) + elseif line_search isa FixedStep + gamma = gamma0 end - # recompute everything once for final verfication / no additional callback call - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. - # last computation done with full evaluation for exact gradient - (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient) - # @show (gradient, primal) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 + @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + + if mod(t, print_iter) == 0 && verbose + tt = regular + if t == 0 + tt = initial + end rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -640,8 +615,34 @@ function stochastic_frank_wolfe( t / tot_time, ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) flush(stdout) end - return x, v, primal, dual_gap, traj_data + t += 1 +end +# recompute everything once for final verfication / no additional callback call +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. 
+# last computation done with full evaluation for exact gradient + +(primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) +v = compute_extreme_point(lmo, gradient) +# @show (gradient, primal) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) +if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end From 4bd9246214e7eb086a6a9a30e27e3da55e4e0e08 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:26:16 +0200 Subject: [PATCH 02/13] more reverting --- src/fw_algorithms.jl | 1034 +++++++++++++++++++++--------------------- 1 file changed, 517 insertions(+), 517 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 7ad67243f..fa1fa262c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -1,6 +1,6 @@ """ -frank_wolfe(f, grad!, lmo, x0; ...) + frank_wolfe(f, grad!, lmo, x0; ...) Simplest form of the Frank-Wolfe algorithm. Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: @@ -11,179 +11,205 @@ Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: - `traj_data` vector of trajectory information. """ function frank_wolfe( -f, -grad!, -lmo, -x0; -line_search::LineSearchMethod=Adaptive(), -L=Inf, -gamma0=0, -step_lim=20, -momentum=nothing, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -linesearch_tol=1e-7, -emphasis::Emphasis=memory, -gradient=nothing, -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f, + grad!, + lmo, + x0; + line_search::LineSearchMethod=Adaptive(), + L=Inf, + gamma0=0, + step_lim=20, + momentum=nothing, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis::Emphasis=memory, + gradient=nothing, + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e\n" -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -tt = regular -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -time_start = time_ns() + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + tt = regular + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + time_start = time_ns() -if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end + if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") + end -if line_search isa FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") -end + if line_search isa FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") + end -if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) - println( - "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", - ) -end + if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + println( + "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", + ) + end -if verbose - println("\nVanilla Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - grad_type = typeof(gradient) - println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") - if emphasis === memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if verbose + println("\nVanilla Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + grad_type = typeof(gradient) + println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") + if emphasis === memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") + end + headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] + print_callback(headers, format_string, print_header=true) end - headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - # if integer, convert element type to most appropriate float - if eltype(x) <: Integer - x = convert(Array{float(eltype(x))}, x) - else - x = convert(Array{eltype(x)}, x) + if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + # if integer, convert element type to most appropriate float + if eltype(x) <: Integer + x = convert(Array{float(eltype(x))}, x) + else + x = convert(Array{eltype(x)}, x) + end + end + first_iter = true + # instanciating container for gradient + if gradient === nothing + gradient = similar(x) end -end -first_iter = true -# instanciating container for gradient -if gradient === nothing - gradient = similar(x) -end -# container for direction -d = similar(x) -gtemp = if momentum === nothing - nothing -else - similar(x) -end -while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop + # container for direction + d = similar(x) + gtemp = if momentum === nothing + nothing + else + similar(x) end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - break end - end - ##################### + ##################### - if momentum === nothing || first_iter - grad!(gradient, x) - if momentum !== 
nothing - gtemp .= gradient + if momentum === nothing || first_iter + grad!(gradient, x) + if momentum !== nothing + gtemp .= gradient + end + else + grad!(gtemp, x) + @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) + end + first_iter = false + + v = compute_extreme_point(lmo, gradient) + # go easy on the memory - only compute if really needed + if ( + (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + line_search isa Shortstep + ) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) end - else - grad!(gtemp, x) - @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) - end - first_iter = false - v = compute_extreme_point(lmo, gradient) - # go easy on the memory - only compute if really needed - if ( - (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - line_search isa Shortstep - ) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end + @emphasis(emphasis, d = x - v) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + momentum === nothing ? gradient : gtemp, # use appropriate storage + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + one(eltype(x)), ) - callback(state) - end - @emphasis(emphasis, d = x - v) - - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - momentum === nothing ? gradient : gtemp, # use appropriate storage - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - one(eltype(x)), - ) - - @emphasis(emphasis, x = x - gamma * d) - - if (mod(t, print_iter) == 0 && verbose) - tt = regular - if t == 0 - tt = initial + + @emphasis(emphasis, x = x - gamma * d) + + if (mod(t, print_iter) == 0 && verbose) + tt = regular + if t == 0 + tt = initial + end + + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + + flush(stdout) end + t = t + 1 + end + # recompute everything once for final verfication / do not record to trajectory though for now! + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -191,41 +217,15 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) t / tot_time, ) print_callback(rep, format_string) - + print_callback(nothing, format_string, print_footer=true) flush(stdout) end - t = t + 1 -end -# recompute everything once for final verfication / do not record to trajectory though for now! -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. 
- -grad!(gradient, x) -v = compute_extreme_point(lmo, gradient) -primal = f(x) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) -if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end """ -lazified_conditional_gradient + lazified_conditional_gradient Similar to [`frank_wolfe`](@ref) but lazyfying the LMO: each call is stored in a cache, which is looked up first for a good-enough direction. @@ -233,170 +233,197 @@ The cache used is a [`FrankWolfe.MultiCacheLMO`](@ref) or a [`FrankWolfe.VectorC depending on whether the provided `cache_size` option is finite. """ function lazified_conditional_gradient( -f, -grad!, -lmo_base, -x0; -line_search::LineSearchMethod=Adaptive(), -L=Inf, -gamma0=0, -K=2.0, -cache_size=Inf, -greedy_lazy=false, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -verbose_it=false, -linesearch_tol=1e-7, -step_lim=20, -emphasis::Emphasis=memory, -gradient=nothing, -VType=typeof(x0), -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f, + grad!, + lmo_base, + x0; + line_search::LineSearchMethod=Adaptive(), + L=Inf, + gamma0=0, + K=2.0, + cache_size=Inf, + greedy_lazy=false, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + verbose_it=false, + linesearch_tol=1e-7, + step_lim=20, + emphasis::Emphasis=memory, + gradient=nothing, + VType=typeof(x0), + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" -if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) -else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) -end - -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -phi = Inf -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -tt = regular -time_start = time_ns() + if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + end -if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + phi = Inf + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + tt = regular + time_start = time_ns() -if line_search isa Agnostic || line_search isa Nonconvex - println("FATAL: Lazification is not known to converge with open-loop step size strategies.") -end + if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. 
Prepare to blow up spectacularly.") + end -if verbose - println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", - ) - grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if line_search isa Agnostic || line_search isa Nonconvex + println("FATAL: Lazification is not known to converge with open-loop step size strategies.") end - headers = - ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - x = convert(Array{float(eltype(x))}, x) -end + if verbose + println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", + ) + grad_type = typeof(gradient) + println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") + end + headers = + ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] + print_callback(headers, format_string, print_header=true) + end -if gradient === nothing - gradient = similar(x) -end + if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + x = convert(Array{float(eltype(x))}, x) + end -# container for direction -d = similar(x) + if gradient === nothing + gradient = similar(x) + end -while t <= max_iteration && dual_gap >= max(epsilon, eps()) + # container for direction + d = similar(x) - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - break end - end - ##################### + ##################### - grad!(gradient, x) + grad!(gradient, x) - threshold = fast_dot(x, gradient) - phi / K + threshold = fast_dot(x, gradient) - phi / K - # go easy on the memory - only compute if really needed - if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) - primal = f(x) - end + # go easy on the memory - only compute if really needed + if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) + primal = f(x) + end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, 
phi / 2) - end + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end + + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + cache_size=length(lmo), + x=x, + v=v, + ) + callback(state) + end + + @emphasis(emphasis, d = x - v) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - cache_size=length(lmo), - x=x, - v=v, + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + gradient, + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + 1.0, ) - callback(state) - end - @emphasis(emphasis, d = x - v) - - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - gradient, - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - 1.0, - ) - - @emphasis(emphasis, x = x - gamma * d) - - if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) - if t == 0 - tt = initial + @emphasis(emphasis, x = x - gamma * d) + + if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) + if t == 0 + tt = initial + end + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + length(lmo), + ) + print_callback(rep, format_string) + flush(stdout) end + t += 1 + end + + # recompute everything once for final verfication / do not record to trajectory though for now! + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + + if verbose || verbose_it + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -405,209 +432,208 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) length(lmo), ) print_callback(rep, format_string) + if verbose + print_callback(nothing, format_string, print_footer=true) + end flush(stdout) end - t += 1 -end - -# recompute everything once for final verfication / do not record to trajectory though for now! -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. -grad!(gradient, x) -v = compute_extreme_point(lmo, gradient) -primal = f(x) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - -if verbose || verbose_it - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - length(lmo), - ) - print_callback(rep, format_string) - if verbose - print_callback(nothing, format_string, print_footer=true) - end - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end """ -stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) + stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) Stochastic version of Frank-Wolfe, evaluates the objective and gradient stochastically, implemented through the [FrankWolfe.StochasticObjective](@ref) interface. 
""" function stochastic_frank_wolfe( -f::StochasticObjective, -lmo, -x0; -line_search::LineSearchMethod=nonconvex, -L=Inf, -gamma0=0, -step_lim=20, -momentum=nothing, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -linesearch_tol=1e-7, -emphasis::Emphasis=blas, -rng=Random.GLOBAL_RNG, -batch_size=length(f.xs) ÷ 10 + 1, -full_evaluation=false, -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f::StochasticObjective, + lmo, + x0; + line_search::LineSearchMethod=nonconvex, + L=Inf, + gamma0=0, + step_lim=20, + momentum=nothing, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis::Emphasis=blas, + rng=Random.GLOBAL_RNG, + batch_size=length(f.xs) ÷ 10 + 1, + full_evaluation=false, + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -tt = regular -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) -time_start = time_ns() - -if line_search == Shortstep && L == Inf - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end - -if line_search == FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") -end + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + tt = regular + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) + time_start = time_ns() -if verbose - println("\nStochastic Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", - ) - # TODO: needs to fix - grad_type = typeof(nothing) - println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if line_search == Shortstep && L == Inf + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") end - headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Array) - x = convert(Array{promote_type(eltype(x), Float64)}, x) -end -first_iter = true -gradient = 0 -while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop + if line_search == FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + if verbose + println("\nStochastic Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", + ) + # TODO: needs to fix + grad_type = typeof(nothing) + println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") + print_callback(headers, format_string, print_header=true) end - ##################### - - if momentum === nothing || first_iter - gradient = compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - else - @emphasis( - emphasis, - gradient = - (momentum * gradient) .+ - (1 - momentum) * compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - ) + if emphasis == memory && !isa(x, Array) + x = convert(Array{promote_type(eltype(x), Float64)}, x) end - first_iter = false + first_iter = true + gradient = 0 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break + end + end - v = compute_extreme_point(lmo, gradient) + ##################### + + if momentum === nothing || first_iter + gradient = compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + else + @emphasis( + emphasis, + gradient = + (momentum * gradient) .+ + (1 - momentum) * compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + ) + end + first_iter = false - # go easy on the memory - only compute if really needed - if (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) - primal = compute_value(f, x, full_evaluation=true) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + v = compute_extreme_point(lmo, gradient) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end + # go easy on the memory - only compute if really needed + if (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) + primal = compute_value(f, x, full_evaluation=true) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - if line_search isa Agnostic - gamma = 2 // (2 + t) - elseif line_search isa Nonconvex - gamma = 1 / sqrt(t + 1) - elseif line_search isa Shortstep - gamma = dual_gap / (L * norm(x - v)^2) - elseif line_search isa RationalShortstep - rat_dual_gap = sum((x - v) .* gradient) - gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) - elseif line_search isa 
FixedStep - gamma = gamma0 - end + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end + + if line_search isa Agnostic + gamma = 2 // (2 + t) + elseif line_search isa Nonconvex + gamma = 1 / sqrt(t + 1) + elseif line_search isa Shortstep + gamma = dual_gap / (L * norm(x - v)^2) + elseif line_search isa RationalShortstep + rat_dual_gap = sum((x - v) .* gradient) + gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) + elseif line_search isa FixedStep + gamma = gamma0 + end - @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) - if mod(t, print_iter) == 0 && verbose - tt = regular - if t == 0 - tt = initial + if mod(t, print_iter) == 0 && verbose + tt = regular + if t == 0 + tt = initial + end + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + flush(stdout) end + t += 1 + end + # recompute everything once for final verfication / no additional callback call + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + # last computation done with full evaluation for exact gradient + + (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) + v = compute_extreme_point(lmo, gradient) + # @show (gradient, primal) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -615,34 +641,8 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) t / tot_time, ) print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) flush(stdout) end - t += 1 -end -# recompute everything once for final verfication / no additional callback call -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. 
-# last computation done with full evaluation for exact gradient - -(primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) -v = compute_extreme_point(lmo, gradient) -# @show (gradient, primal) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) -if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end From 9ebef342dd206153ed26081256110a7a39a6de84 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:28:31 +0200 Subject: [PATCH 03/13] and fixing --- src/fw_algorithms.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index c5faa0ddd..fa1fa262c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -432,7 +432,9 @@ function lazified_conditional_gradient( length(lmo), ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) + if verbose + print_callback(nothing, format_string, print_footer=true) + end flush(stdout) end return x, v, primal, dual_gap, traj_data From 24b5b061204766239519d3edc60757e98275a30b Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 8 Jul 2021 08:26:03 +0200 Subject: [PATCH 04/13] added warmstarting to lcg --- examples/warmstart_lmo.jl | 62 +++++++++++++++++++++++++++++++++++++++ src/fw_algorithms.jl | 13 +++++--- 2 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 examples/warmstart_lmo.jl diff --git a/examples/warmstart_lmo.jl b/examples/warmstart_lmo.jl new file mode 100644 index 000000000..cebb5369d --- /dev/null +++ b/examples/warmstart_lmo.jl @@ -0,0 +1,62 @@ +include("activate.jl") + +using LinearAlgebra + +n = Int(1e4) +k = 10000 + +xpi = rand(n); +total = sum(xpi); +const xp = xpi # ./ total; + +f(x) = norm(x - xp)^2 +function grad!(storage, x) + @. storage = 2 * (x - xp) +end + +# better for memory consumption as we do coordinate-wise ops + +function cf(x, xp) + return LinearAlgebra.norm(x .- xp)^2 +end + +function cgrad!(storage, x, xp) + return @. 
storage = 2 * (x - xp) +end + +# lmo = FrankWolfe.ProbabilitySimplexOracle(1); + +lmo = FrankWolfe.KSparseLMO(100, 1.0) +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); + +# arbitrary cache + +x0 = deepcopy(x00) + +@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + L=100, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + emphasis=FrankWolfe.memory, + verbose=true, +); + +@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + L=100, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + emphasis=FrankWolfe.memory, + verbose=true, + warmstart_lmo = lmo +); + diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index fa1fa262c..526d899b9 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -257,15 +257,20 @@ function lazified_conditional_gradient( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + warmstart_lmo=nothing, ) # format string for output of the algorithm format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" - if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + if warmstart_lmo === nothing + if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + end else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + lmo = warmstart_lmo end t = 0 @@ -437,7 +442,7 @@ function lazified_conditional_gradient( end flush(stdout) end - return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data, lmo end """ From b68e828f5efadab82be0bee30bd0a23ccb32143a Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Fri, 16 Jul 2021 18:09:07 +0200 Subject: [PATCH 05/13] adding some tracing example for gamma --- .gitignore | 1 + examples/tracing_gamma.jl | 115 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 examples/tracing_gamma.jl diff --git a/.gitignore b/.gitignore index 80eaef87b..89820ea8a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ examples/precompile_fw_clean.jl examples/precompile_fw.jl cc/libloporacle.so cc/* +examples/temp.jl diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl new file mode 100644 index 000000000..4aba97989 --- /dev/null +++ b/examples/tracing_gamma.jl @@ -0,0 +1,115 @@ +using LinearAlgebra +using FrankWolfe +using ProgressMeter +using Plots + +n = Int(1e5) +k = 1000 + +xpi = rand(n); +total = sum(xpi); +const xp = xpi ./ total; + +# better for memory consumption as we do coordinate-wise ops + +function cf(x, xp) + return LinearAlgebra.norm(x .- xp)^2 +end + +function cgrad!(storage, x, xp) + return @. 
storage = 2 * (x - xp) +end + +lmo = FrankWolfe.ProbabilitySimplexOracle(1); +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); + +FrankWolfe.benchmark_oracles( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + () -> randn(n), + lmo; + k=100, +) + + +function build_callback(storage) + return function callback(data) + return push!(storage, (Tuple(data)[1:5]...,data.gamma)) + end +end + + +####### 2/(2+t) rule + +x0 = copy(x00) + +trajectory_ag = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ag) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Agnostic(), + print_iter=k / 10, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + + +####### adaptive + +x0 = copy(x00) + +trajectory_ada = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ada) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + + +####### backtracking + +x0 = copy(x00) + +trajectory_ls = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ls) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Shortstep(), + print_iter=k / 10, + L=2, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + +x = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] +gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)] +gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)] +gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)] + +Plots.plot(x,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) +Plots.plot!(x,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) +Plots.plot!(x,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) From 659abc2744bc501490cbda1a9361b6efebde75f2 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Fri, 16 Jul 2021 18:16:41 +0200 Subject: [PATCH 06/13] minor --- examples/tracing_gamma.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 4aba97989..d7a47bc38 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -20,7 +20,11 @@ function cgrad!(storage, x, xp) return @. 
storage = 2 * (x - xp) end -lmo = FrankWolfe.ProbabilitySimplexOracle(1); +# lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +# lmo = FrankWolfe.UnitSimplexOracle(1.0); +# lmo = FrankWolfe.UnitSimplexOracle(1.0); +lmo = FrankWolfe.KSparseLMO(40, 1.0); + x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); FrankWolfe.benchmark_oracles( From 8278c1a25bd4d8afbe61dc3f3922636234865070 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Sat, 17 Jul 2021 00:19:42 +0200 Subject: [PATCH 07/13] more tests --- examples/tracing_gamma.jl | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index d7a47bc38..606a9448c 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -3,8 +3,9 @@ using FrankWolfe using ProgressMeter using Plots -n = Int(1e5) -k = 1000 +n = Int(1e2) +k = Int(1e6) +eps=1e-8 xpi = rand(n); total = sum(xpi); @@ -20,10 +21,11 @@ function cgrad!(storage, x, xp) return @. storage = 2 * (x - xp) end -# lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # lmo = FrankWolfe.UnitSimplexOracle(1.0); # lmo = FrankWolfe.UnitSimplexOracle(1.0); -lmo = FrankWolfe.KSparseLMO(40, 1.0); +# lmo = FrankWolfe.KSparseLMO(40, 1.0); +# lmo = FrankWolfe.LpNormLMO{2}(1.0) x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); @@ -57,6 +59,7 @@ callback = build_callback(trajectory_ag) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Agnostic(), print_iter=k / 10, callback=callback, @@ -79,6 +82,7 @@ callback = build_callback(trajectory_ada) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Adaptive(), print_iter=k / 10, callback=callback, @@ -101,6 +105,7 @@ callback = build_callback(trajectory_ls) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Shortstep(), print_iter=k / 10, L=2, @@ -109,11 +114,13 @@ callback = build_callback(trajectory_ls) verbose=true, ); -x = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] +x_ag = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)] +x_ada = [trajectory_ada[i][1]+1 for i in eachindex(trajectory_ada)] gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)] +x_ls = [trajectory_ls[i][1]+1 for i in eachindex(trajectory_ls)] gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)] -Plots.plot(x,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) -Plots.plot!(x,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) -Plots.plot!(x,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) +Plots.plot(x_ag,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) +Plots.plot!(x_ada,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) +Plots.plot!(x_ls,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) From 1d33e3183e9c783e8b382b6ac6a10bdfabbb7456 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Sat, 17 Jul 2021 00:21:17 +0200 Subject: [PATCH 08/13] temp --- examples/tracing_gamma.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 606a9448c..ff9555375 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -4,7 +4,7 @@ using ProgressMeter using Plots n = Int(1e2) -k = Int(1e6) +k = Int(1e4) eps=1e-8 xpi = rand(n); From ebd459d18e26b7d0c125ab5101df59e8a8e5375a Mon Sep 17 
00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 00:10:29 +0200 Subject: [PATCH 09/13] minor verbose --- src/fw_algorithms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index c896c2a32..37f1b1b7c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -55,7 +55,7 @@ function frank_wolfe( println("FATAL: gamma0 not set. We are not going to move a single bit.") end - if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep} && verbose) println( "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", ) From d3ed5f9dbe38343bf7bbcc766b83cd1262542f88 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 01:45:25 +0200 Subject: [PATCH 10/13] added ada --- src/fw_algorithms.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 37f1b1b7c..90dbce7fd 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,6 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, + ada=false, + ada_eps=1e-8, print_callback=print_callback, ) @@ -96,6 +98,11 @@ function frank_wolfe( else similar(x) end + + # H and H2 container + H = similar(gradient) * 0.0 + H2 = similar(gradient) * 0.0 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) ##################### @@ -131,6 +138,14 @@ function frank_wolfe( end first_iter = false + # H2 = H2+est_grad_f_x**2 + # H = eps+np.sqrt(H2) + if ada && momentum === nothing + H2 += gradient.^2 + H = (ada_eps .+ (H2.^(0.5))).^(-1.0) + gradient = gradient .* H + end + v = compute_extreme_point(lmo, gradient) # go easy on the memory - only compute if really needed if ( From c979e849d353fc062a2f36a3e3c81568dc6679b2 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 01:54:05 +0200 Subject: [PATCH 11/13] removed ada - bad performance --- src/fw_algorithms.jl | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 90dbce7fd..4e187f49d 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,8 +30,6 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, - ada=false, - ada_eps=1e-8, print_callback=print_callback, ) @@ -99,9 +97,9 @@ function frank_wolfe( similar(x) end - # H and H2 container - H = similar(gradient) * 0.0 - H2 = similar(gradient) * 0.0 + # # H and H2 container + # H = similar(gradient) * 0.0 + # H2 = similar(gradient) * 0.0 while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -138,13 +136,13 @@ function frank_wolfe( end first_iter = false - # H2 = H2+est_grad_f_x**2 - # H = eps+np.sqrt(H2) - if ada && momentum === nothing - H2 += gradient.^2 - H = (ada_eps .+ (H2.^(0.5))).^(-1.0) - gradient = gradient .* H - end + # # H2 = H2+est_grad_f_x**2 + # # H = eps+np.sqrt(H2) + # if ada && momentum === nothing + # H2 += gradient.^2 + # H = (ada_eps .+ (H2.^(0.5))).^(-1.0) + # gradient = gradient .* H + # end v = compute_extreme_point(lmo, gradient) # go easy on the memory - only compute if really needed From 2636093c590f69dc9ed5903ff7a599349a1b4038 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: 
Thu, 22 Jul 2021 02:00:45 +0200 Subject: [PATCH 12/13] fix in tracing_gamma --- examples/tracing_gamma.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index ff9555375..4080c63e6 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -1,5 +1,7 @@ +include("activate.jl") + using LinearAlgebra -using FrankWolfe +# using FrankWolfe using ProgressMeter using Plots From 6fad9659910f0bd3e6bc6381b8c1d0c799d29dea Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 13:37:17 +0200 Subject: [PATCH 13/13] fix to ci --- examples/tracing_gamma.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 4080c63e6..c79b59e8d 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -2,8 +2,8 @@ include("activate.jl") using LinearAlgebra # using FrankWolfe -using ProgressMeter -using Plots +# using ProgressMeter +# using Plots n = Int(1e2) k = Int(1e4)
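
For reference, the `ada` gradient rescaling introduced in PATCH 10 and disabled again in PATCH 11 ("removed ada - bad performance") survives above only as commented-out lines and Python-style pseudocode comments. Below is a minimal standalone Julia sketch of what those lines compute, assuming a dense floating-point gradient; the helper name `ada_rescale!` is illustrative only and is not part of FrankWolfe.jl or of the patches themselves.

# AdaGrad-style coordinate-wise rescaling of the gradient, as sketched in the
# commented-out lines of PATCH 10/11. H2 accumulates squared gradient entries
# across iterations; ada_eps stabilizes the inverse square root.
function ada_rescale!(gradient::AbstractVector{<:Real}, H2::AbstractVector{<:Real}; ada_eps=1e-8)
    H2 .+= gradient .^ 2               # H2 = H2 + est_grad_f_x .^ 2
    H = inv.(ada_eps .+ sqrt.(H2))     # H = 1 / (eps + sqrt(H2)), coordinate-wise
    gradient .*= H                     # rescaled gradient is what would be passed to the LMO
    return gradient
end

# Intended use inside the Frank-Wolfe loop (sketch):
#   H2 = zero(gradient)            # once, before the iterations
#   ada_rescale!(gradient, H2)     # each iteration, right after grad!(gradient, x)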