From 6bf107c61a76da76049002edcca4088d3113a661 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:17:45 +0200 Subject: [PATCH 01/13] fixing merge mess --- src/fw_algorithms.jl | 1033 +++++++++++++++++++++--------------------- 1 file changed, 517 insertions(+), 516 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 846ea3f5b..7ad67243f 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -1,6 +1,6 @@ """ - frank_wolfe(f, grad!, lmo, x0; ...) +frank_wolfe(f, grad!, lmo, x0; ...) Simplest form of the Frank-Wolfe algorithm. Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: @@ -11,205 +11,179 @@ Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: - `traj_data` vector of trajectory information. """ function frank_wolfe( - f, - grad!, - lmo, - x0; - line_search::LineSearchMethod=Adaptive(), - L=Inf, - gamma0=0, - step_lim=20, - momentum=nothing, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis::Emphasis=memory, - gradient=nothing, - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f, +grad!, +lmo, +x0; +line_search::LineSearchMethod=Adaptive(), +L=Inf, +gamma0=0, +step_lim=20, +momentum=nothing, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +linesearch_tol=1e-7, +emphasis::Emphasis=memory, +gradient=nothing, +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - tt = regular - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - time_start = time_ns() +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e\n" +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +tt = regular +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +time_start = time_ns() - if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") +end - if line_search isa FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") - end +if line_search isa FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") +end - if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) - println( - "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", - ) - end +if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + println( + "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", + ) +end - if verbose - println("\nVanilla Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - grad_type = typeof(gradient) - println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") - if emphasis === memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] - print_callback(headers, format_string, print_header=true) +if verbose + println("\nVanilla Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + grad_type = typeof(gradient) + println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") + if emphasis === memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - # if integer, convert element type to most appropriate float - if eltype(x) <: Integer - x = convert(Array{float(eltype(x))}, x) - else - x = convert(Array{eltype(x)}, x) - end - end - first_iter = true - # instanciating container for gradient - if gradient === nothing - gradient = similar(x) +if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + # if integer, convert element type to most appropriate float + if eltype(x) <: Integer + x = convert(Array{float(eltype(x))}, x) + else + x = convert(Array{eltype(x)}, x) end +end +first_iter = true +# instanciating container for gradient +if gradient === nothing + gradient = similar(x) +end - # container for direction - d = similar(x) - gtemp = if momentum === nothing - nothing - else - similar(x) +# container for direction +d = similar(x) +gtemp = if momentum === nothing + nothing +else + similar(x) +end +while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop end - while t <= max_iteration && dual_gap >= max(epsilon, eps()) + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" end + break end + end - ##################### + ##################### - if momentum === nothing || first_iter - grad!(gradient, x) - if momentum !== 
nothing - gtemp .= gradient - end - else - grad!(gtemp, x) - @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) - end - first_iter = false - - v = compute_extreme_point(lmo, gradient) - # go easy on the memory - only compute if really needed - if ( - (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - line_search isa Shortstep - ) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if momentum === nothing || first_iter + grad!(gradient, x) + if momentum !== nothing + gtemp .= gradient end + else + grad!(gtemp, x) + @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) + end + first_iter = false - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end - @emphasis(emphasis, d = x - v) + v = compute_extreme_point(lmo, gradient) + # go easy on the memory - only compute if really needed + if ( + (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + line_search isa Shortstep + ) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - momentum === nothing ? gradient : gtemp, # use appropriate storage - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - one(eltype(x)), + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, ) - - @emphasis(emphasis, x = x - gamma * d) - - if (mod(t, print_iter) == 0 && verbose) - tt = regular - if t == 0 - tt = initial - end - - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - - flush(stdout) - end - t = t + 1 + callback(state) end - # recompute everything once for final verfication / do not record to trajectory though for now! - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. + @emphasis(emphasis, d = x - v) + + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + momentum === nothing ? gradient : gtemp, # use appropriate storage + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + one(eltype(x)), + ) + + @emphasis(emphasis, x = x - gamma * d) + + if (mod(t, print_iter) == 0 && verbose) + tt = regular + if t == 0 + tt = initial + end - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -217,15 +191,41 @@ function frank_wolfe( t / tot_time, ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) + flush(stdout) end - return x, v, primal, dual_gap, traj_data + t = t + 1 +end +# recompute everything once for final verfication / do not record to trajectory though for now! +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. 
+ +grad!(gradient, x) +v = compute_extreme_point(lmo, gradient) +primal = f(x) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) +if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end """ - lazified_conditional_gradient +lazified_conditional_gradient Similar to [`frank_wolfe`](@ref) but lazyfying the LMO: each call is stored in a cache, which is looked up first for a good-enough direction. @@ -233,196 +233,170 @@ The cache used is a [`FrankWolfe.MultiCacheLMO`](@ref) or a [`FrankWolfe.VectorC depending on whether the provided `cache_size` option is finite. """ function lazified_conditional_gradient( - f, - grad!, - lmo_base, - x0; - line_search::LineSearchMethod=Adaptive(), - L=Inf, - gamma0=0, - K=2.0, - cache_size=Inf, - greedy_lazy=false, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - step_lim=20, - emphasis::Emphasis=memory, - gradient=nothing, - VType=typeof(x0), - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f, +grad!, +lmo_base, +x0; +line_search::LineSearchMethod=Adaptive(), +L=Inf, +gamma0=0, +K=2.0, +cache_size=Inf, +greedy_lazy=false, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +verbose_it=false, +linesearch_tol=1e-7, +step_lim=20, +emphasis::Emphasis=memory, +gradient=nothing, +VType=typeof(x0), +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" - if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) - else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) - end +if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) +else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) +end - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - phi = Inf - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - tt = regular - time_start = time_ns() +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +phi = Inf +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +tt = regular +time_start = time_ns() - if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. 
Prepare to blow up spectacularly.") +end - if line_search isa Agnostic || line_search isa Nonconvex - println("FATAL: Lazification is not known to converge with open-loop step size strategies.") - end +if line_search isa Agnostic || line_search isa Nonconvex + println("FATAL: Lazification is not known to converge with open-loop step size strategies.") +end - if verbose - println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", - ) - grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = - ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] - print_callback(headers, format_string, print_header=true) +if verbose + println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", + ) + grad_type = typeof(gradient) + println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = + ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - x = convert(Array{float(eltype(x))}, x) - end +if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + x = convert(Array{float(eltype(x))}, x) +end - if gradient === nothing - gradient = similar(x) - end +if gradient === nothing + gradient = similar(x) +end - # container for direction - d = similar(x) +# container for direction +d = similar(x) - while t <= max_iteration && dual_gap >= max(epsilon, eps()) +while t <= max_iteration && dual_gap >= max(epsilon, eps()) - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" end + break end + end - ##################### - - grad!(gradient, x) - - threshold = fast_dot(x, gradient) - phi / K + ##################### - # go easy on the memory - only compute if really needed - if ((mod(t, print_iter) == 0 && verbose ) || callback !== nothing) - primal = f(x) - end + grad!(gradient, x) - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, phi / 2) - end + threshold = fast_dot(x, gradient) - phi / K - 
if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - cache_size=length(lmo), - x=x, - v=v, - ) - callback(state) - end + # go easy on the memory - only compute if really needed + if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) + primal = f(x) + end - @emphasis(emphasis, d = x - v) + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - gradient, - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - 1.0, + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + cache_size=length(lmo), + x=x, + v=v, ) - - @emphasis(emphasis, x = x - gamma * d) - - if verbose && (mod(t, print_iter) == 0 || tt == dualstep) - if t == 0 - tt = initial - end - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - length(lmo), - ) - print_callback(rep, format_string) - flush(stdout) - end - t += 1 + callback(state) end - # recompute everything once for final verfication / do not record to trajectory though for now! - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. - grad!(gradient, x) - v = compute_extreme_point(lmo, gradient) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 + @emphasis(emphasis, d = x - v) + + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + gradient, + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + 1.0, + ) + + @emphasis(emphasis, x = x - gamma * d) + + if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) + if t == 0 + tt = initial + end rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -431,208 +405,209 @@ function lazified_conditional_gradient( length(lmo), ) print_callback(rep, format_string) - if verbose - print_callback(nothing, format_string, print_footer=true) - end flush(stdout) end - return x, v, primal, dual_gap, traj_data + t += 1 +end + +# recompute everything once for final verfication / do not record to trajectory though for now! +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. +grad!(gradient, x) +v = compute_extreme_point(lmo, gradient) +primal = f(x) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + +if verbose || verbose_it + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + length(lmo), + ) + print_callback(rep, format_string) + if verbose + print_callback(nothing, format_string, print_footer=true) + end + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end """ - stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) +stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) 
Stochastic version of Frank-Wolfe, evaluates the objective and gradient stochastically, implemented through the [FrankWolfe.StochasticObjective](@ref) interface. """ function stochastic_frank_wolfe( - f::StochasticObjective, - lmo, - x0; - line_search::LineSearchMethod=nonconvex, - L=Inf, - gamma0=0, - step_lim=20, - momentum=nothing, - epsilon=1e-7, - max_iteration=10000, - print_iter=1000, - trajectory=false, - verbose=false, - linesearch_tol=1e-7, - emphasis::Emphasis=blas, - rng=Random.GLOBAL_RNG, - batch_size=length(f.xs) ÷ 10 + 1, - full_evaluation=false, - callback=nothing, - timeout=Inf, - print_callback=FrankWolfe.print_callback, +f::StochasticObjective, +lmo, +x0; +line_search::LineSearchMethod=nonconvex, +L=Inf, +gamma0=0, +step_lim=20, +momentum=nothing, +epsilon=1e-7, +max_iteration=10000, +print_iter=1000, +trajectory=false, +verbose=false, +linesearch_tol=1e-7, +emphasis::Emphasis=blas, +rng=Random.GLOBAL_RNG, +batch_size=length(f.xs) ÷ 10 + 1, +full_evaluation=false, +callback=nothing, +timeout=Inf, +print_callback=FrankWolfe.print_callback, ) - # format string for output of the algorithm - format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - - t = 0 - dual_gap = Inf - primal = Inf - v = [] - x = x0 - tt = regular - traj_data = [] - if trajectory && callback === nothing - callback = trajectory_callback(traj_data) - end - dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) - time_start = time_ns() +# format string for output of the algorithm +format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + +t = 0 +dual_gap = Inf +primal = Inf +v = [] +x = x0 +tt = regular +traj_data = [] +if trajectory && callback === nothing + callback = trajectory_callback(traj_data) +end +dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) +time_start = time_ns() - if line_search == Shortstep && L == Inf - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") - end +if line_search == Shortstep && L == Inf + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") +end - if line_search == FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") - end +if line_search == FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") +end - if verbose - println("\nStochastic Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", - ) - # TODO: needs to fix - grad_type = typeof(nothing) - println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") - end - headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") - print_callback(headers, format_string, print_header=true) +if verbose + println("\nStochastic Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", + ) + # TODO: needs to fix + grad_type = typeof(nothing) + println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") + print_callback(headers, format_string, print_header=true) +end - if emphasis == memory && !isa(x, Array) - x = convert(Array{promote_type(eltype(x), Float64)}, x) +if emphasis == memory && !isa(x, Array) + x = convert(Array{promote_type(eltype(x), Float64)}, x) +end +first_iter = true +gradient = 0 +while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop end - first_iter = true - gradient = 0 - while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break - end - end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 - ##################### - - if momentum === nothing || first_iter - gradient = compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - else - @emphasis( - emphasis, - gradient = - (momentum * gradient) .+ - (1 - momentum) * compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - ) + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - first_iter = false + end - v = compute_extreme_point(lmo, gradient) + ##################### - # go easy on the memory - only compute if really needed - if (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) - primal = compute_value(f, x, full_evaluation=true) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + if momentum === nothing || first_iter + gradient = compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + else + @emphasis( + emphasis, + gradient = + (momentum * gradient) .+ + (1 - momentum) * compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + 
full_evaluation=full_evaluation, + ) + ) + end + first_iter = false - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end + v = compute_extreme_point(lmo, gradient) - if line_search isa Agnostic - gamma = 2 // (2 + t) - elseif line_search isa Nonconvex - gamma = 1 / sqrt(t + 1) - elseif line_search isa Shortstep - gamma = dual_gap / (L * norm(x - v)^2) - elseif line_search isa RationalShortstep - rat_dual_gap = sum((x - v) .* gradient) - gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) - elseif line_search isa FixedStep - gamma = gamma0 - end + # go easy on the memory - only compute if really needed + if (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) + primal = compute_value(f, x, full_evaluation=true) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end - if mod(t, print_iter) == 0 && verbose - tt = regular - if t == 0 - tt = initial - end - rep = ( - st[Symbol(tt)], - string(t), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - flush(stdout) - end - t += 1 + if line_search isa Agnostic + gamma = 2 // (2 + t) + elseif line_search isa Nonconvex + gamma = 1 / sqrt(t + 1) + elseif line_search isa Shortstep + gamma = dual_gap / (L * norm(x - v)^2) + elseif line_search isa RationalShortstep + rat_dual_gap = sum((x - v) .* gradient) + gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) + elseif line_search isa FixedStep + gamma = gamma0 end - # recompute everything once for final verfication / no additional callback call - # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting - # hence the final computation. - # last computation done with full evaluation for exact gradient - (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) - v = compute_extreme_point(lmo, gradient) - # @show (gradient, primal) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 + @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + + if mod(t, print_iter) == 0 && verbose + tt = regular + if t == 0 + tt = initial + end rep = ( st[Symbol(tt)], - string(t - 1), + string(t), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -640,8 +615,34 @@ function stochastic_frank_wolfe( t / tot_time, ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) flush(stdout) end - return x, v, primal, dual_gap, traj_data + t += 1 +end +# recompute everything once for final verfication / no additional callback call +# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting +# hence the final computation. 
+# last computation done with full evaluation for exact gradient + +(primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) +v = compute_extreme_point(lmo, gradient) +# @show (gradient, primal) +dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) +if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 + rep = ( + st[Symbol(tt)], + string(t - 1), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) + flush(stdout) +end +return x, v, primal, dual_gap, traj_data end From 4bd9246214e7eb086a6a9a30e27e3da55e4e0e08 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:26:16 +0200 Subject: [PATCH 02/13] more reverting --- src/fw_algorithms.jl | 1034 +++++++++++++++++++++--------------------- 1 file changed, 517 insertions(+), 517 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 7ad67243f..fa1fa262c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -1,6 +1,6 @@ """ -frank_wolfe(f, grad!, lmo, x0; ...) + frank_wolfe(f, grad!, lmo, x0; ...) Simplest form of the Frank-Wolfe algorithm. Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: @@ -11,179 +11,205 @@ Returns a tuple `(x, v, primal, dual_gap, traj_data)` with: - `traj_data` vector of trajectory information. """ function frank_wolfe( -f, -grad!, -lmo, -x0; -line_search::LineSearchMethod=Adaptive(), -L=Inf, -gamma0=0, -step_lim=20, -momentum=nothing, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -linesearch_tol=1e-7, -emphasis::Emphasis=memory, -gradient=nothing, -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f, + grad!, + lmo, + x0; + line_search::LineSearchMethod=Adaptive(), + L=Inf, + gamma0=0, + step_lim=20, + momentum=nothing, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis::Emphasis=memory, + gradient=nothing, + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e\n" -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -tt = regular -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -time_start = time_ns() + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + tt = regular + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + time_start = time_ns() -if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end + if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") + end -if line_search isa FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") -end + if line_search isa FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") + end -if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) - println( - "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", - ) -end + if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + println( + "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", + ) + end -if verbose - println("\nVanilla Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", - ) - grad_type = typeof(gradient) - println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") - if emphasis === memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if verbose + println("\nVanilla Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $numType", + ) + grad_type = typeof(gradient) + println("MOMENTUM: $momentum GRADIENTTYPE: $grad_type") + if emphasis === memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") + end + headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] + print_callback(headers, format_string, print_header=true) end - headers = ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec"] - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - # if integer, convert element type to most appropriate float - if eltype(x) <: Integer - x = convert(Array{float(eltype(x))}, x) - else - x = convert(Array{eltype(x)}, x) + if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + # if integer, convert element type to most appropriate float + if eltype(x) <: Integer + x = convert(Array{float(eltype(x))}, x) + else + x = convert(Array{eltype(x)}, x) + end + end + first_iter = true + # instanciating container for gradient + if gradient === nothing + gradient = similar(x) end -end -first_iter = true -# instanciating container for gradient -if gradient === nothing - gradient = similar(x) -end -# container for direction -d = similar(x) -gtemp = if momentum === nothing - nothing -else - similar(x) -end -while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop + # container for direction + d = similar(x) + gtemp = if momentum === nothing + nothing + else + similar(x) end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - break end - end - ##################### + ##################### - if momentum === nothing || first_iter - grad!(gradient, x) - if momentum !== 
nothing - gtemp .= gradient + if momentum === nothing || first_iter + grad!(gradient, x) + if momentum !== nothing + gtemp .= gradient + end + else + grad!(gtemp, x) + @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) + end + first_iter = false + + v = compute_extreme_point(lmo, gradient) + # go easy on the memory - only compute if really needed + if ( + (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + line_search isa Shortstep + ) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) end - else - grad!(gtemp, x) - @emphasis(emphasis, gradient = (momentum * gradient) + (1 - momentum) * gtemp) - end - first_iter = false - v = compute_extreme_point(lmo, gradient) - # go easy on the memory - only compute if really needed - if ( - (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - line_search isa Shortstep - ) - primal = f(x) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end + @emphasis(emphasis, d = x - v) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + momentum === nothing ? gradient : gtemp, # use appropriate storage + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + one(eltype(x)), ) - callback(state) - end - @emphasis(emphasis, d = x - v) - - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - momentum === nothing ? gradient : gtemp, # use appropriate storage - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - one(eltype(x)), - ) - - @emphasis(emphasis, x = x - gamma * d) - - if (mod(t, print_iter) == 0 && verbose) - tt = regular - if t == 0 - tt = initial + + @emphasis(emphasis, x = x - gamma * d) + + if (mod(t, print_iter) == 0 && verbose) + tt = regular + if t == 0 + tt = initial + end + + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + + flush(stdout) end + t = t + 1 + end + # recompute everything once for final verfication / do not record to trajectory though for now! + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -191,41 +217,15 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) t / tot_time, ) print_callback(rep, format_string) - + print_callback(nothing, format_string, print_footer=true) flush(stdout) end - t = t + 1 -end -# recompute everything once for final verfication / do not record to trajectory though for now! -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. 
- -grad!(gradient, x) -v = compute_extreme_point(lmo, gradient) -primal = f(x) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) -if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end """ -lazified_conditional_gradient + lazified_conditional_gradient Similar to [`frank_wolfe`](@ref) but lazyfying the LMO: each call is stored in a cache, which is looked up first for a good-enough direction. @@ -233,170 +233,197 @@ The cache used is a [`FrankWolfe.MultiCacheLMO`](@ref) or a [`FrankWolfe.VectorC depending on whether the provided `cache_size` option is finite. """ function lazified_conditional_gradient( -f, -grad!, -lmo_base, -x0; -line_search::LineSearchMethod=Adaptive(), -L=Inf, -gamma0=0, -K=2.0, -cache_size=Inf, -greedy_lazy=false, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -verbose_it=false, -linesearch_tol=1e-7, -step_lim=20, -emphasis::Emphasis=memory, -gradient=nothing, -VType=typeof(x0), -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f, + grad!, + lmo_base, + x0; + line_search::LineSearchMethod=Adaptive(), + L=Inf, + gamma0=0, + K=2.0, + cache_size=Inf, + greedy_lazy=false, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + verbose_it=false, + linesearch_tol=1e-7, + step_lim=20, + emphasis::Emphasis=memory, + gradient=nothing, + VType=typeof(x0), + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" -if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) -else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) -end - -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -phi = Inf -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -tt = regular -time_start = time_ns() + if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + end -if line_search isa Shortstep && !isfinite(L) - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + phi = Inf + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + tt = regular + time_start = time_ns() -if line_search isa Agnostic || line_search isa Nonconvex - println("FATAL: Lazification is not known to converge with open-loop step size strategies.") -end + if line_search isa Shortstep && !isfinite(L) + println("FATAL: Lipschitz constant not set. 
Prepare to blow up spectacularly.") + end -if verbose - println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", - ) - grad_type = typeof(gradient) - println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if line_search isa Agnostic || line_search isa Nonconvex + println("FATAL: Lazification is not known to converge with open-loop step size strategies.") end - headers = - ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) - x = convert(Array{float(eltype(x))}, x) -end + if verbose + println("\nLazified Conditional Gradients (Frank-Wolfe + Lazification).") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration K: $K TYPE: $numType", + ) + grad_type = typeof(gradient) + println("GRADIENTTYPE: $grad_type CACHESIZE $cache_size GREEDYCACHE: $greedy_lazy") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") + end + headers = + ["Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec", "Cache Size"] + print_callback(headers, format_string, print_header=true) + end -if gradient === nothing - gradient = similar(x) -end + if emphasis == memory && !isa(x, Union{Array,SparseArrays.AbstractSparseArray}) + x = convert(Array{float(eltype(x))}, x) + end -# container for direction -d = similar(x) + if gradient === nothing + gradient = similar(x) + end -while t <= max_iteration && dual_gap >= max(epsilon, eps()) + # container for direction + d = similar(x) - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop - end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break end - break end - end - ##################### + ##################### - grad!(gradient, x) + grad!(gradient, x) - threshold = fast_dot(x, gradient) - phi / K + threshold = fast_dot(x, gradient) - phi / K - # go easy on the memory - only compute if really needed - if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) - primal = f(x) - end + # go easy on the memory - only compute if really needed + if ((mod(t, print_iter) == 0 && (verbose || verbose_it) ) || callback !== nothing) + primal = f(x) + end - v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) - tt = lazy - if fast_dot(v, gradient) > threshold - tt = dualstep - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - phi = min(dual_gap, 
phi / 2) - end + v = compute_extreme_point(lmo, gradient, threshold=threshold, greedy=greedy_lazy) + tt = lazy + if fast_dot(v, gradient) > threshold + tt = dualstep + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + phi = min(dual_gap, phi / 2) + end + + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + cache_size=length(lmo), + x=x, + v=v, + ) + callback(state) + end + + @emphasis(emphasis, d = x - v) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - cache_size=length(lmo), - x=x, - v=v, + gamma, L = line_search_wrapper( + line_search, + t, + f, + grad!, + x, + d, + gradient, + dual_gap, + L, + gamma0, + linesearch_tol, + step_lim, + 1.0, ) - callback(state) - end - @emphasis(emphasis, d = x - v) - - gamma, L = line_search_wrapper( - line_search, - t, - f, - grad!, - x, - d, - gradient, - dual_gap, - L, - gamma0, - linesearch_tol, - step_lim, - 1.0, - ) - - @emphasis(emphasis, x = x - gamma * d) - - if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) - if t == 0 - tt = initial + @emphasis(emphasis, x = x - gamma * d) + + if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep) + if t == 0 + tt = initial + end + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + length(lmo), + ) + print_callback(rep, format_string) + flush(stdout) end + t += 1 + end + + # recompute everything once for final verfication / do not record to trajectory though for now! + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + grad!(gradient, x) + v = compute_extreme_point(lmo, gradient) + primal = f(x) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + + if verbose || verbose_it + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -405,209 +432,208 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) length(lmo), ) print_callback(rep, format_string) + if verbose + print_callback(nothing, format_string, print_footer=true) + end flush(stdout) end - t += 1 -end - -# recompute everything once for final verfication / do not record to trajectory though for now! -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. -grad!(gradient, x) -v = compute_extreme_point(lmo, gradient) -primal = f(x) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - -if verbose || verbose_it - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - length(lmo), - ) - print_callback(rep, format_string) - if verbose - print_callback(nothing, format_string, print_footer=true) - end - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end """ -stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) + stochastic_frank_wolfe(f::StochasticObjective, lmo, x0; ...) Stochastic version of Frank-Wolfe, evaluates the objective and gradient stochastically, implemented through the [FrankWolfe.StochasticObjective](@ref) interface. 
""" function stochastic_frank_wolfe( -f::StochasticObjective, -lmo, -x0; -line_search::LineSearchMethod=nonconvex, -L=Inf, -gamma0=0, -step_lim=20, -momentum=nothing, -epsilon=1e-7, -max_iteration=10000, -print_iter=1000, -trajectory=false, -verbose=false, -linesearch_tol=1e-7, -emphasis::Emphasis=blas, -rng=Random.GLOBAL_RNG, -batch_size=length(f.xs) ÷ 10 + 1, -full_evaluation=false, -callback=nothing, -timeout=Inf, -print_callback=FrankWolfe.print_callback, + f::StochasticObjective, + lmo, + x0; + line_search::LineSearchMethod=nonconvex, + L=Inf, + gamma0=0, + step_lim=20, + momentum=nothing, + epsilon=1e-7, + max_iteration=10000, + print_iter=1000, + trajectory=false, + verbose=false, + linesearch_tol=1e-7, + emphasis::Emphasis=blas, + rng=Random.GLOBAL_RNG, + batch_size=length(f.xs) ÷ 10 + 1, + full_evaluation=false, + callback=nothing, + timeout=Inf, + print_callback=FrankWolfe.print_callback, ) -# format string for output of the algorithm -format_string = "%6s %13s %14e %14e %14e %14e %14e\n" - -t = 0 -dual_gap = Inf -primal = Inf -v = [] -x = x0 -tt = regular -traj_data = [] -if trajectory && callback === nothing - callback = trajectory_callback(traj_data) -end -dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) -time_start = time_ns() - -if line_search == Shortstep && L == Inf - println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") -end - -if line_search == FixedStep && gamma0 == 0 - println("FATAL: gamma0 not set. We are not going to move a single bit.") -end + # format string for output of the algorithm + format_string = "%6s %13s %14e %14e %14e %14e %14e\n" + + t = 0 + dual_gap = Inf + primal = Inf + v = [] + x = x0 + tt = regular + traj_data = [] + if trajectory && callback === nothing + callback = trajectory_callback(traj_data) + end + dx = similar(x0) # Array{eltype(x0)}(undef, length(x0)) + time_start = time_ns() -if verbose - println("\nStochastic Frank-Wolfe Algorithm.") - numType = eltype(x0) - println( - "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", - ) - # TODO: needs to fix - grad_type = typeof(nothing) - println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") - if emphasis == memory - println("WARNING: In memory emphasis mode iterates are written back into x0!") + if line_search == Shortstep && L == Inf + println("FATAL: Lipschitz constant not set. Prepare to blow up spectacularly.") end - headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") - print_callback(headers, format_string, print_header=true) -end -if emphasis == memory && !isa(x, Array) - x = convert(Array{promote_type(eltype(x), Float64)}, x) -end -first_iter = true -gradient = 0 -while t <= max_iteration && dual_gap >= max(epsilon, eps()) - - ##################### - # managing time and Ctrl-C - ##################### - time_at_loop = time_ns() - if t == 0 - time_start = time_at_loop + if line_search == FixedStep && gamma0 == 0 + println("FATAL: gamma0 not set. 
We are not going to move a single bit.") end - # time is measured at beginning of loop for consistency throughout all algorithms - tot_time = (time_at_loop - time_start) / 1e9 - if timeout < Inf - if tot_time ≥ timeout - if verbose - @info "Time limit reached" - end - break + if verbose + println("\nStochastic Frank-Wolfe Algorithm.") + numType = eltype(x0) + println( + "EMPHASIS: $emphasis STEPSIZE: $line_search EPSILON: $epsilon max_iteration: $max_iteration TYPE: $numType", + ) + # TODO: needs to fix + grad_type = typeof(nothing) + println("GRADIENTTYPE: $grad_type MOMENTUM: $momentum BATCHSIZE: $batch_size ") + if emphasis == memory + println("WARNING: In memory emphasis mode iterates are written back into x0!") end + headers = ("Type", "Iteration", "Primal", "Dual", "Dual Gap", "Time", "It/sec") + print_callback(headers, format_string, print_header=true) end - ##################### - - if momentum === nothing || first_iter - gradient = compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - else - @emphasis( - emphasis, - gradient = - (momentum * gradient) .+ - (1 - momentum) * compute_gradient( - f, - x, - rng=rng, - batch_size=batch_size, - full_evaluation=full_evaluation, - ) - ) + if emphasis == memory && !isa(x, Array) + x = convert(Array{promote_type(eltype(x), Float64)}, x) end - first_iter = false + first_iter = true + gradient = 0 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) + + ##################### + # managing time and Ctrl-C + ##################### + time_at_loop = time_ns() + if t == 0 + time_start = time_at_loop + end + # time is measured at beginning of loop for consistency throughout all algorithms + tot_time = (time_at_loop - time_start) / 1e9 + + if timeout < Inf + if tot_time ≥ timeout + if verbose + @info "Time limit reached" + end + break + end + end - v = compute_extreme_point(lmo, gradient) + ##################### + + if momentum === nothing || first_iter + gradient = compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + else + @emphasis( + emphasis, + gradient = + (momentum * gradient) .+ + (1 - momentum) * compute_gradient( + f, + x, + rng=rng, + batch_size=batch_size, + full_evaluation=full_evaluation, + ) + ) + end + first_iter = false - # go easy on the memory - only compute if really needed - if (mod(t, print_iter) == 0 && verbose) || - callback !== nothing || - !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) - primal = compute_value(f, x, full_evaluation=true) - dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) - end + v = compute_extreme_point(lmo, gradient) - if callback !== nothing - state = ( - t=t, - primal=primal, - dual=primal - dual_gap, - dual_gap=dual_gap, - time=tot_time, - x=x, - v=v, - ) - callback(state) - end + # go easy on the memory - only compute if really needed + if (mod(t, print_iter) == 0 && verbose) || + callback !== nothing || + !(line_search isa Agnostic || line_search isa Nonconvex || line_search isa FixedStep) + primal = compute_value(f, x, full_evaluation=true) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + end - if line_search isa Agnostic - gamma = 2 // (2 + t) - elseif line_search isa Nonconvex - gamma = 1 / sqrt(t + 1) - elseif line_search isa Shortstep - gamma = dual_gap / (L * norm(x - v)^2) - elseif line_search isa RationalShortstep - rat_dual_gap = sum((x - v) .* gradient) - gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) - elseif line_search isa 
FixedStep - gamma = gamma0 - end + if callback !== nothing + state = ( + t=t, + primal=primal, + dual=primal - dual_gap, + dual_gap=dual_gap, + time=tot_time, + x=x, + v=v, + ) + callback(state) + end + + if line_search isa Agnostic + gamma = 2 // (2 + t) + elseif line_search isa Nonconvex + gamma = 1 / sqrt(t + 1) + elseif line_search isa Shortstep + gamma = dual_gap / (L * norm(x - v)^2) + elseif line_search isa RationalShortstep + rat_dual_gap = sum((x - v) .* gradient) + gamma = rat_dual_gap // (L * sum((x - v) .^ 2)) + elseif line_search isa FixedStep + gamma = gamma0 + end - @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) + @emphasis(emphasis, x = (1 - gamma) * x + gamma * v) - if mod(t, print_iter) == 0 && verbose - tt = regular - if t == 0 - tt = initial + if mod(t, print_iter) == 0 && verbose + tt = regular + if t == 0 + tt = initial + end + rep = ( + st[Symbol(tt)], + string(t), + Float64(primal), + Float64(primal - dual_gap), + Float64(dual_gap), + tot_time, + t / tot_time, + ) + print_callback(rep, format_string) + flush(stdout) end + t += 1 + end + # recompute everything once for final verfication / no additional callback call + # this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting + # hence the final computation. + # last computation done with full evaluation for exact gradient + + (primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) + v = compute_extreme_point(lmo, gradient) + # @show (gradient, primal) + dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) + if verbose + tt = last + tot_time = (time_ns() - time_start) / 1.0e9 rep = ( st[Symbol(tt)], - string(t), + string(t - 1), Float64(primal), Float64(primal - dual_gap), Float64(dual_gap), @@ -615,34 +641,8 @@ while t <= max_iteration && dual_gap >= max(epsilon, eps()) t / tot_time, ) print_callback(rep, format_string) + print_callback(nothing, format_string, print_footer=true) flush(stdout) end - t += 1 -end -# recompute everything once for final verfication / no additional callback call -# this is important as some variants do not recompute f(x) and the dual_gap regularly but only when reporting -# hence the final computation. 
-# last computation done with full evaluation for exact gradient - -(primal, gradient) = compute_value_gradient(f, x, full_evaluation=true) -v = compute_extreme_point(lmo, gradient) -# @show (gradient, primal) -dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient) -if verbose - tt = last - tot_time = (time_ns() - time_start) / 1.0e9 - rep = ( - st[Symbol(tt)], - string(t - 1), - Float64(primal), - Float64(primal - dual_gap), - Float64(dual_gap), - tot_time, - t / tot_time, - ) - print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) - flush(stdout) -end -return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data end From 9ebef342dd206153ed26081256110a7a39a6de84 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Wed, 7 Jul 2021 00:28:31 +0200 Subject: [PATCH 03/13] and fixing --- src/fw_algorithms.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index c5faa0ddd..fa1fa262c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -432,7 +432,9 @@ function lazified_conditional_gradient( length(lmo), ) print_callback(rep, format_string) - print_callback(nothing, format_string, print_footer=true) + if verbose + print_callback(nothing, format_string, print_footer=true) + end flush(stdout) end return x, v, primal, dual_gap, traj_data From 24b5b061204766239519d3edc60757e98275a30b Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 8 Jul 2021 08:26:03 +0200 Subject: [PATCH 04/13] added warmstarting to lcg --- examples/warmstart_lmo.jl | 62 +++++++++++++++++++++++++++++++++++++++ src/fw_algorithms.jl | 13 +++++--- 2 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 examples/warmstart_lmo.jl diff --git a/examples/warmstart_lmo.jl b/examples/warmstart_lmo.jl new file mode 100644 index 000000000..cebb5369d --- /dev/null +++ b/examples/warmstart_lmo.jl @@ -0,0 +1,62 @@ +include("activate.jl") + +using LinearAlgebra + +n = Int(1e4) +k = 10000 + +xpi = rand(n); +total = sum(xpi); +const xp = xpi # ./ total; + +f(x) = norm(x - xp)^2 +function grad!(storage, x) + @. storage = 2 * (x - xp) +end + +# better for memory consumption as we do coordinate-wise ops + +function cf(x, xp) + return LinearAlgebra.norm(x .- xp)^2 +end + +function cgrad!(storage, x, xp) + return @. 
storage = 2 * (x - xp) +end + +# lmo = FrankWolfe.ProbabilitySimplexOracle(1); + +lmo = FrankWolfe.KSparseLMO(100, 1.0) +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); + +# arbitrary cache + +x0 = deepcopy(x00) + +@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + L=100, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + emphasis=FrankWolfe.memory, + verbose=true, +); + +@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + L=100, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + emphasis=FrankWolfe.memory, + verbose=true, + warmstart_lmo = lmo +); + diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index fa1fa262c..526d899b9 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -257,15 +257,20 @@ function lazified_conditional_gradient( callback=nothing, timeout=Inf, print_callback=FrankWolfe.print_callback, + warmstart_lmo=nothing, ) # format string for output of the algorithm format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n" - if isfinite(cache_size) - lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + if warmstart_lmo === nothing + if isfinite(cache_size) + lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base) + else + lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + end else - lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base) + lmo = warmstart_lmo end t = 0 @@ -437,7 +442,7 @@ function lazified_conditional_gradient( end flush(stdout) end - return x, v, primal, dual_gap, traj_data + return x, v, primal, dual_gap, traj_data, lmo end """ From b68e828f5efadab82be0bee30bd0a23ccb32143a Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Fri, 16 Jul 2021 18:09:07 +0200 Subject: [PATCH 05/13] adding some tracing example for gamma --- .gitignore | 1 + examples/tracing_gamma.jl | 115 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 examples/tracing_gamma.jl diff --git a/.gitignore b/.gitignore index 80eaef87b..89820ea8a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ examples/precompile_fw_clean.jl examples/precompile_fw.jl cc/libloporacle.so cc/* +examples/temp.jl diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl new file mode 100644 index 000000000..4aba97989 --- /dev/null +++ b/examples/tracing_gamma.jl @@ -0,0 +1,115 @@ +using LinearAlgebra +using FrankWolfe +using ProgressMeter +using Plots + +n = Int(1e5) +k = 1000 + +xpi = rand(n); +total = sum(xpi); +const xp = xpi ./ total; + +# better for memory consumption as we do coordinate-wise ops + +function cf(x, xp) + return LinearAlgebra.norm(x .- xp)^2 +end + +function cgrad!(storage, x, xp) + return @. 
storage = 2 * (x - xp) +end + +lmo = FrankWolfe.ProbabilitySimplexOracle(1); +x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); + +FrankWolfe.benchmark_oracles( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + () -> randn(n), + lmo; + k=100, +) + + +function build_callback(storage) + return function callback(data) + return push!(storage, (Tuple(data)[1:5]...,data.gamma)) + end +end + + +####### 2/(2+t) rule + +x0 = copy(x00) + +trajectory_ag = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ag) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Agnostic(), + print_iter=k / 10, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + + +####### adaptive + +x0 = copy(x00) + +trajectory_ada = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ada) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Adaptive(), + print_iter=k / 10, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + + +####### backtracking + +x0 = copy(x00) + +trajectory_ls = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}() +callback = build_callback(trajectory_ls) + +@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe( + x -> cf(x, xp), + (str, x) -> cgrad!(str, x, xp), + lmo, + x0, + max_iteration=k, + trajectory=true, + line_search=FrankWolfe.Shortstep(), + print_iter=k / 10, + L=2, + callback=callback, + emphasis=FrankWolfe.memory, + verbose=true, +); + +x = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] +gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)] +gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)] +gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)] + +Plots.plot(x,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) +Plots.plot!(x,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) +Plots.plot!(x,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) From 659abc2744bc501490cbda1a9361b6efebde75f2 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Fri, 16 Jul 2021 18:16:41 +0200 Subject: [PATCH 06/13] minor --- examples/tracing_gamma.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 4aba97989..d7a47bc38 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -20,7 +20,11 @@ function cgrad!(storage, x, xp) return @. 
storage = 2 * (x - xp) end -lmo = FrankWolfe.ProbabilitySimplexOracle(1); +# lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +# lmo = FrankWolfe.UnitSimplexOracle(1.0); +# lmo = FrankWolfe.UnitSimplexOracle(1.0); +lmo = FrankWolfe.KSparseLMO(40, 1.0); + x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); FrankWolfe.benchmark_oracles( From 8278c1a25bd4d8afbe61dc3f3922636234865070 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Sat, 17 Jul 2021 00:19:42 +0200 Subject: [PATCH 07/13] more tests --- examples/tracing_gamma.jl | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index d7a47bc38..606a9448c 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -3,8 +3,9 @@ using FrankWolfe using ProgressMeter using Plots -n = Int(1e5) -k = 1000 +n = Int(1e2) +k = Int(1e6) +eps=1e-8 xpi = rand(n); total = sum(xpi); @@ -20,10 +21,11 @@ function cgrad!(storage, x, xp) return @. storage = 2 * (x - xp) end -# lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); +lmo = FrankWolfe.ProbabilitySimplexOracle(1.0); # lmo = FrankWolfe.UnitSimplexOracle(1.0); # lmo = FrankWolfe.UnitSimplexOracle(1.0); -lmo = FrankWolfe.KSparseLMO(40, 1.0); +# lmo = FrankWolfe.KSparseLMO(40, 1.0); +# lmo = FrankWolfe.LpNormLMO{2}(1.0) x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n)); @@ -57,6 +59,7 @@ callback = build_callback(trajectory_ag) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Agnostic(), print_iter=k / 10, callback=callback, @@ -79,6 +82,7 @@ callback = build_callback(trajectory_ada) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Adaptive(), print_iter=k / 10, callback=callback, @@ -101,6 +105,7 @@ callback = build_callback(trajectory_ls) x0, max_iteration=k, trajectory=true, + epsilon=eps, line_search=FrankWolfe.Shortstep(), print_iter=k / 10, L=2, @@ -109,11 +114,13 @@ callback = build_callback(trajectory_ls) verbose=true, ); -x = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] +x_ag = [trajectory_ag[i][1]+1 for i in eachindex(trajectory_ag)] gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)] +x_ada = [trajectory_ada[i][1]+1 for i in eachindex(trajectory_ada)] gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)] +x_ls = [trajectory_ls[i][1]+1 for i in eachindex(trajectory_ls)] gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)] -Plots.plot(x,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) -Plots.plot!(x,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) -Plots.plot!(x,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) +Plots.plot(x_ag,gamma_ag,label="gamma_ag", yaxis=:log, xaxis=:log) +Plots.plot!(x_ada,gamma_ada,label="gamma_ada", yaxis=:log, xaxis=:log) +Plots.plot!(x_ls,gamma_ls,label="gamma_ls", yaxis=:log, xaxis=:log) From 1d33e3183e9c783e8b382b6ac6a10bdfabbb7456 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Sat, 17 Jul 2021 00:21:17 +0200 Subject: [PATCH 08/13] temp --- examples/tracing_gamma.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 606a9448c..ff9555375 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -4,7 +4,7 @@ using ProgressMeter using Plots n = Int(1e2) -k = Int(1e6) +k = Int(1e4) eps=1e-8 xpi = rand(n); From ebd459d18e26b7d0c125ab5101df59e8a8e5375a Mon Sep 17 
00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 00:10:29 +0200 Subject: [PATCH 09/13] minor verbose --- src/fw_algorithms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index c896c2a32..37f1b1b7c 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -55,7 +55,7 @@ function frank_wolfe( println("FATAL: gamma0 not set. We are not going to move a single bit.") end - if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep}) + if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep} && verbose) println( "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.", ) From d3ed5f9dbe38343bf7bbcc766b83cd1262542f88 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 01:45:25 +0200 Subject: [PATCH 10/13] added ada --- src/fw_algorithms.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 37f1b1b7c..90dbce7fd 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,6 +30,8 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, + ada=false, + ada_eps=1e-8, print_callback=print_callback, ) @@ -96,6 +98,11 @@ function frank_wolfe( else similar(x) end + + # H and H2 container + H = similar(gradient) * 0.0 + H2 = similar(gradient) * 0.0 + while t <= max_iteration && dual_gap >= max(epsilon, eps()) ##################### @@ -131,6 +138,14 @@ function frank_wolfe( end first_iter = false + # H2 = H2+est_grad_f_x**2 + # H = eps+np.sqrt(H2) + if ada && momentum === nothing + H2 += gradient.^2 + H = (ada_eps .+ (H2.^(0.5))).^(-1.0) + gradient = gradient .* H + end + v = compute_extreme_point(lmo, gradient) # go easy on the memory - only compute if really needed if ( From c979e849d353fc062a2f36a3e3c81568dc6679b2 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 01:54:05 +0200 Subject: [PATCH 11/13] removed ada - bad performance --- src/fw_algorithms.jl | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl index 90dbce7fd..4e187f49d 100644 --- a/src/fw_algorithms.jl +++ b/src/fw_algorithms.jl @@ -30,8 +30,6 @@ function frank_wolfe( gradient=nothing, callback=nothing, timeout=Inf, - ada=false, - ada_eps=1e-8, print_callback=print_callback, ) @@ -99,9 +97,9 @@ function frank_wolfe( similar(x) end - # H and H2 container - H = similar(gradient) * 0.0 - H2 = similar(gradient) * 0.0 + # # H and H2 container + # H = similar(gradient) * 0.0 + # H2 = similar(gradient) * 0.0 while t <= max_iteration && dual_gap >= max(epsilon, eps()) @@ -138,13 +136,13 @@ function frank_wolfe( end first_iter = false - # H2 = H2+est_grad_f_x**2 - # H = eps+np.sqrt(H2) - if ada && momentum === nothing - H2 += gradient.^2 - H = (ada_eps .+ (H2.^(0.5))).^(-1.0) - gradient = gradient .* H - end + # # H2 = H2+est_grad_f_x**2 + # # H = eps+np.sqrt(H2) + # if ada && momentum === nothing + # H2 += gradient.^2 + # H = (ada_eps .+ (H2.^(0.5))).^(-1.0) + # gradient = gradient .* H + # end v = compute_extreme_point(lmo, gradient) # go easy on the memory - only compute if really needed From 2636093c590f69dc9ed5903ff7a599349a1b4038 Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: 
Thu, 22 Jul 2021 02:00:45 +0200 Subject: [PATCH 12/13] fix in tracing_gamma --- examples/tracing_gamma.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index ff9555375..4080c63e6 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -1,5 +1,7 @@ +include("activate.jl") + using LinearAlgebra -using FrankWolfe +# using FrankWolfe using ProgressMeter using Plots From 6fad9659910f0bd3e6bc6381b8c1d0c799d29dea Mon Sep 17 00:00:00 2001 From: Sebastian Pokutta <23001135+pokutta@users.noreply.github.com> Date: Thu, 22 Jul 2021 13:37:17 +0200 Subject: [PATCH 13/13] fix to ci --- examples/tracing_gamma.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl index 4080c63e6..c79b59e8d 100644 --- a/examples/tracing_gamma.jl +++ b/examples/tracing_gamma.jl @@ -2,8 +2,8 @@ include("activate.jl") using LinearAlgebra # using FrankWolfe -using ProgressMeter -using Plots +# using ProgressMeter +# using Plots n = Int(1e2) k = Int(1e4)
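
For reference, the `ada` gradient rescaling introduced in PATCH 10 and disabled again in PATCH 11 ("removed ada - bad performance") survives above only as commented-out lines and Python-style pseudocode comments. Below is a minimal standalone Julia sketch of what those lines compute, assuming a dense floating-point gradient; the helper name `ada_rescale!` is illustrative only and is not part of FrankWolfe.jl or of the patches themselves.

# AdaGrad-style coordinate-wise rescaling of the gradient, as sketched in the
# commented-out lines of PATCH 10/11. H2 accumulates squared gradient entries
# across iterations; ada_eps stabilizes the inverse square root.
function ada_rescale!(gradient::AbstractVector{<:Real}, H2::AbstractVector{<:Real}; ada_eps=1e-8)
    H2 .+= gradient .^ 2               # H2 = H2 + est_grad_f_x .^ 2
    H = inv.(ada_eps .+ sqrt.(H2))     # H = 1 / (eps + sqrt(H2)), coordinate-wise
    gradient .*= H                     # rescaled gradient is what would be passed to the LMO
    return gradient
end

# Intended use inside the Frank-Wolfe loop (sketch):
#   H2 = zero(gradient)            # once, before the iterations
#   ada_rescale!(gradient, H2)     # each iteration, right after grad!(gradient, x)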