diff --git a/.gitignore b/.gitignore
index 80eaef87b..89820ea8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ examples/precompile_fw_clean.jl
 examples/precompile_fw.jl
 cc/libloporacle.so
 cc/*
+examples/temp.jl
diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl
new file mode 100644
index 000000000..c79b59e8d
--- /dev/null
+++ b/examples/tracing_gamma.jl
@@ -0,0 +1,128 @@
+include("activate.jl")
+
+using LinearAlgebra
+# using FrankWolfe
+# using ProgressMeter
+using Plots
+
+n = Int(1e2)
+k = Int(1e4)
+eps = 1e-8
+
+xpi = rand(n);
+total = sum(xpi);
+const xp = xpi ./ total;
+
+# better for memory consumption as we do coordinate-wise ops
+
+function cf(x, xp)
+    return LinearAlgebra.norm(x .- xp)^2
+end
+
+function cgrad!(storage, x, xp)
+    return @. storage = 2 * (x - xp)
+end
+
+lmo = FrankWolfe.ProbabilitySimplexOracle(1.0);
+# lmo = FrankWolfe.UnitSimplexOracle(1.0);
+# lmo = FrankWolfe.KSparseLMO(40, 1.0);
+# lmo = FrankWolfe.LpNormLMO{2}(1.0)
+
+x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n));
+
+FrankWolfe.benchmark_oracles(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    () -> randn(n),
+    lmo;
+    k=100,
+)
+
+# store the first five trajectory fields plus the step size gamma
+function build_callback(storage)
+    return function callback(data)
+        return push!(storage, (Tuple(data)[1:5]..., data.gamma))
+    end
+end
+
+
+####### 2/(2+t) rule
+
+x0 = copy(x00)
+
+trajectory_ag = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ag)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Agnostic(),
+    print_iter=k / 10,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+
+####### adaptive
+
+x0 = copy(x00)
+
+trajectory_ada = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ada)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+
+####### short step rule (uses L=2)
+
+x0 = copy(x00)
+
+trajectory_ls = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ls)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Shortstep(),
+    print_iter=k / 10,
+    L=2,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+x_ag = [trajectory_ag[i][1] + 1 for i in eachindex(trajectory_ag)]
+gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)]
+x_ada = [trajectory_ada[i][1] + 1 for i in eachindex(trajectory_ada)]
+gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)]
+x_ls = [trajectory_ls[i][1] + 1 for i in eachindex(trajectory_ls)]
+gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)]
+
+Plots.plot(x_ag, gamma_ag, label="gamma_ag", yaxis=:log, xaxis=:log)
+Plots.plot!(x_ada, gamma_ada, label="gamma_ada", yaxis=:log, xaxis=:log)
+Plots.plot!(x_ls, gamma_ls, label="gamma_ls", yaxis=:log, xaxis=:log)
diff --git a/examples/warmstart_lmo.jl b/examples/warmstart_lmo.jl
new file mode 100644
index 000000000..cebb5369d
--- /dev/null
+++ b/examples/warmstart_lmo.jl
@@ -0,0 +1,62 @@
+include("activate.jl")
+
+using LinearAlgebra
+
+n = Int(1e4)
+k = 10000
+
+xpi = rand(n);
+total = sum(xpi);
+const xp = xpi # ./ total;
+
+f(x) = norm(x - xp)^2
+function grad!(storage, x)
+    @. storage = 2 * (x - xp)
+end
+
+# better for memory consumption as we do coordinate-wise ops
+
+function cf(x, xp)
+    return LinearAlgebra.norm(x .- xp)^2
+end
+
+function cgrad!(storage, x, xp)
+    return @. storage = 2 * (x - xp)
+end
+
+# lmo = FrankWolfe.ProbabilitySimplexOracle(1);
+
+lmo = FrankWolfe.KSparseLMO(100, 1.0)
+x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n));
+
+x0 = deepcopy(x00)
+
+# first run: the cache LMO is built internally and returned as the last output
+@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    L=100,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+# second run: warm-started with the already populated cache LMO from the first run
+@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    L=100,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+    warmstart_lmo=lmo,
+);
+
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index 5799356e7..4e187f49d 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -55,7 +55,7 @@ function frank_wolfe(
         println("FATAL: gamma0 not set. We are not going to move a single bit.")
     end
 
-    if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep})
+    if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep} && verbose)
         println(
             "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.",
         )
@@ -96,6 +96,11 @@ function frank_wolfe(
     else
         similar(x)
    end
+
+    # # H and H2 containers for the (disabled) AdaGrad-style gradient scaling below
+    # H = similar(gradient) * 0.0
+    # H2 = similar(gradient) * 0.0
+
     while t <= max_iteration && dual_gap >= max(epsilon, eps())
 
         #####################
@@ -131,6 +136,14 @@ function frank_wolfe(
         end
         first_iter = false
 
+        # # H2 = H2 + est_grad_f_x**2
+        # # H = eps + np.sqrt(H2)
+        # if ada && momentum === nothing
+        #     H2 += gradient .^ 2
+        #     H = (ada_eps .+ (H2 .^ 0.5)) .^ (-1.0)
+        #     gradient = gradient .* H
+        # end
+
         v = compute_extreme_point(lmo, gradient)
         # go easy on the memory - only compute if really needed
         if (
@@ -248,6 +261,7 @@ function lazified_conditional_gradient(
     print_iter=1000,
     trajectory=false,
     verbose=false,
+    verbose_it=false,
     linesearch_tol=1e-7,
     step_lim=20,
     emphasis::Emphasis=memory,
@@ -256,15 +270,20 @@ function lazified_conditional_gradient(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    warmstart_lmo=nothing,
 )
 
     # format string for output of the algorithm
     format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n"
 
-    if isfinite(cache_size)
-        lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base)
+    if warmstart_lmo === nothing
+        if isfinite(cache_size)
+            lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base)
+        else
+            lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+        end
     else
-        lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+        lmo = warmstart_lmo
     end
 
     t = 0
@@ -347,7 +366,7 @@ function lazified_conditional_gradient(
         threshold = fast_dot(x, gradient) - phi / K
 
         # go easy on the memory - only compute if really needed
-        if ((mod(t, print_iter) == 0 && verbose ) || callback !== nothing)
+        if ((mod(t, print_iter) == 0 && (verbose || verbose_it)) || callback !== nothing)
             primal = f(x)
         end
 
@@ -394,7 +413,7 @@ function lazified_conditional_gradient(
 
         @emphasis(emphasis, x = x - gamma * d)
 
-        if verbose && (mod(t, print_iter) == 0 || tt == dualstep)
+        if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep)
             if t == 0
                 tt = initial
             end
@@ -422,7 +441,7 @@ function lazified_conditional_gradient(
 
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
-    if verbose
+    if verbose || verbose_it
         tt = last
         tot_time = (time_ns() - time_start) / 1.0e9
         rep = (
@@ -436,10 +455,12 @@ function lazified_conditional_gradient(
             length(lmo),
         )
         print_callback(rep, format_string)
-        print_callback(nothing, format_string, print_footer=true)
+        if verbose
+            print_callback(nothing, format_string, print_footer=true)
+        end
         flush(stdout)
     end
 
-    return x, v, primal, dual_gap, traj_data
+    return x, v, primal, dual_gap, traj_data, lmo
 end
 """
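
Below is a minimal usage sketch (not part of the patch) of the warmstart mechanism introduced above: the first call to lazified_conditional_gradient builds the cache LMO and now returns it as a sixth output, and the second call reuses it through the new warmstart_lmo keyword. The objective, dimension, and LMO here are illustrative placeholders mirroring examples/warmstart_lmo.jl.

using FrankWolfe, LinearAlgebra

n = 1000
xp = rand(n)
f(x) = norm(x - xp)^2
grad!(storage, x) = @. storage = 2 * (x - xp)

lmo = FrankWolfe.KSparseLMO(100, 1.0)          # placeholder LMO, as in the example
x0 = FrankWolfe.compute_extreme_point(lmo, zeros(n))

# first solve: the cache LMO is built internally and returned as the last output
x, v, primal, dual_gap, traj, cache_lmo = FrankWolfe.lazified_conditional_gradient(
    f, grad!, lmo, x0, max_iteration=1_000, verbose=true,
)

# warm-started solve: pass the populated cache back in instead of rebuilding it
x, v, primal, dual_gap, traj, cache_lmo = FrankWolfe.lazified_conditional_gradient(
    f, grad!, lmo, x0, max_iteration=1_000, verbose=true, warmstart_lmo=cache_lmo,
)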