diff --git a/.gitignore b/.gitignore
index 80eaef87b..89820ea8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ examples/precompile_fw_clean.jl
 examples/precompile_fw.jl
 cc/libloporacle.so
 cc/*
+examples/temp.jl
diff --git a/examples/tracing_gamma.jl b/examples/tracing_gamma.jl
new file mode 100644
index 000000000..c79b59e8d
--- /dev/null
+++ b/examples/tracing_gamma.jl
@@ -0,0 +1,128 @@
+include("activate.jl")
+
+using LinearAlgebra
+# using FrankWolfe
+# using ProgressMeter
+using Plots
+
+n = Int(1e2)
+k = Int(1e4)
+eps = 1e-8
+
+xpi = rand(n);
+total = sum(xpi);
+const xp = xpi ./ total;
+
+# better for memory consumption as we do coordinate-wise ops
+
+function cf(x, xp)
+    return LinearAlgebra.norm(x .- xp)^2
+end
+
+function cgrad!(storage, x, xp)
+    return @. storage = 2 * (x - xp)
+end
+
+lmo = FrankWolfe.ProbabilitySimplexOracle(1.0);
+# lmo = FrankWolfe.UnitSimplexOracle(1.0);
+# lmo = FrankWolfe.KSparseLMO(40, 1.0);
+# lmo = FrankWolfe.LpNormLMO{2}(1.0)
+
+x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n));
+
+FrankWolfe.benchmark_oracles(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    () -> randn(n),
+    lmo;
+    k=100,
+)
+
+# store the first five trajectory fields plus the step size gamma
+function build_callback(storage)
+    return function callback(data)
+        return push!(storage, (Tuple(data)[1:5]..., data.gamma))
+    end
+end
+
+
+####### 2/(2+t) rule
+
+x0 = copy(x00)
+
+trajectory_ag = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ag)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Agnostic(),
+    print_iter=k / 10,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+
+####### adaptive
+
+x0 = copy(x00)
+
+trajectory_ada = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ada)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+
+####### short step rule (uses L=2)
+
+x0 = copy(x00)
+
+trajectory_ls = Vector{Tuple{Int64,Float64,Float64,Float64,Float64,Float64}}()
+callback = build_callback(trajectory_ls)
+
+@time x, v, primal, dual_gap = FrankWolfe.frank_wolfe(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    trajectory=true,
+    epsilon=eps,
+    line_search=FrankWolfe.Shortstep(),
+    print_iter=k / 10,
+    L=2,
+    callback=callback,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+x_ag = [trajectory_ag[i][1] + 1 for i in eachindex(trajectory_ag)]
+gamma_ag = [trajectory_ag[i][6] for i in eachindex(trajectory_ag)]
+x_ada = [trajectory_ada[i][1] + 1 for i in eachindex(trajectory_ada)]
+gamma_ada = [trajectory_ada[i][6] for i in eachindex(trajectory_ada)]
+x_ls = [trajectory_ls[i][1] + 1 for i in eachindex(trajectory_ls)]
+gamma_ls = [trajectory_ls[i][6] for i in eachindex(trajectory_ls)]
+
+Plots.plot(x_ag, gamma_ag, label="gamma_ag", yaxis=:log, xaxis=:log)
+Plots.plot!(x_ada, gamma_ada, label="gamma_ada", yaxis=:log, xaxis=:log)
+Plots.plot!(x_ls, gamma_ls, label="gamma_ls", yaxis=:log, xaxis=:log)
diff --git a/examples/warmstart_lmo.jl b/examples/warmstart_lmo.jl
new file mode 100644
index 000000000..cebb5369d
--- /dev/null
+++ b/examples/warmstart_lmo.jl
@@ -0,0 +1,62 @@
+include("activate.jl")
+
+using LinearAlgebra
+
+n = Int(1e4)
+k = 10000
+
+xpi = rand(n);
+total = sum(xpi);
+const xp = xpi # ./ total;
+
+f(x) = norm(x - xp)^2
+function grad!(storage, x)
+    @. storage = 2 * (x - xp)
+end
+
+# better for memory consumption as we do coordinate-wise ops
+
+function cf(x, xp)
+    return LinearAlgebra.norm(x .- xp)^2
+end
+
+function cgrad!(storage, x, xp)
+    return @. storage = 2 * (x - xp)
+end
+
+# lmo = FrankWolfe.ProbabilitySimplexOracle(1);
+
+lmo = FrankWolfe.KSparseLMO(100, 1.0)
+x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n));
+
+x0 = deepcopy(x00)
+
+# first run: the cache LMO is built internally and returned as the last output
+@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    L=100,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+);
+
+# second run: warm-started with the already populated cache LMO from the first run
+@time x, v, primal, dual_gap, trajectory, lmo = FrankWolfe.lazified_conditional_gradient(
+    x -> cf(x, xp),
+    (str, x) -> cgrad!(str, x, xp),
+    lmo,
+    x0,
+    max_iteration=k,
+    L=100,
+    line_search=FrankWolfe.Adaptive(),
+    print_iter=k / 10,
+    emphasis=FrankWolfe.memory,
+    verbose=true,
+    warmstart_lmo=lmo,
+);
+
diff --git a/src/fw_algorithms.jl b/src/fw_algorithms.jl
index 5799356e7..4e187f49d 100644
--- a/src/fw_algorithms.jl
+++ b/src/fw_algorithms.jl
@@ -55,7 +55,7 @@ function frank_wolfe(
         println("FATAL: gamma0 not set. We are not going to move a single bit.")
     end
 
-    if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep})
+    if (!isnothing(momentum) && line_search isa Union{Shortstep,Adaptive,RationalShortstep} && verbose)
         println(
             "WARNING: Momentum-averaged gradients should usually be used with agnostic stepsize rules.",
         )
@@ -96,6 +96,11 @@ function frank_wolfe(
     else
         similar(x)
    end
+
+    # # H and H2 containers for the (disabled) AdaGrad-style gradient scaling below
+    # H = similar(gradient) * 0.0
+    # H2 = similar(gradient) * 0.0
+
     while t <= max_iteration && dual_gap >= max(epsilon, eps())
 
         #####################
@@ -131,6 +136,14 @@ function frank_wolfe(
         end
         first_iter = false
 
+        # # H2 = H2 + est_grad_f_x**2
+        # # H = eps + np.sqrt(H2)
+        # if ada && momentum === nothing
+        #     H2 += gradient .^ 2
+        #     H = (ada_eps .+ (H2 .^ 0.5)) .^ (-1.0)
+        #     gradient = gradient .* H
+        # end
+
         v = compute_extreme_point(lmo, gradient)
         # go easy on the memory - only compute if really needed
         if (
@@ -248,6 +261,7 @@ function lazified_conditional_gradient(
     print_iter=1000,
     trajectory=false,
     verbose=false,
+    verbose_it=false,
     linesearch_tol=1e-7,
     step_lim=20,
     emphasis::Emphasis=memory,
@@ -256,15 +270,20 @@ function lazified_conditional_gradient(
     callback=nothing,
     timeout=Inf,
     print_callback=print_callback,
+    warmstart_lmo=nothing,
 )
 
     # format string for output of the algorithm
     format_string = "%6s %13s %14e %14e %14e %14e %14e %14i\n"
 
-    if isfinite(cache_size)
-        lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base)
+    if warmstart_lmo === nothing
+        if isfinite(cache_size)
+            lmo = MultiCacheLMO{cache_size,typeof(lmo_base),VType}(lmo_base)
+        else
+            lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+        end
     else
-        lmo = VectorCacheLMO{typeof(lmo_base),VType}(lmo_base)
+        lmo = warmstart_lmo
     end
 
     t = 0
@@ -347,7 +366,7 @@ function lazified_conditional_gradient(
         threshold = fast_dot(x, gradient) - phi / K
 
         # go easy on the memory - only compute if really needed
-        if ((mod(t, print_iter) == 0 && verbose ) || callback !== nothing)
+        if ((mod(t, print_iter) == 0 && (verbose || verbose_it)) || callback !== nothing)
             primal = f(x)
         end
 
@@ -394,7 +413,7 @@ function lazified_conditional_gradient(
 
         @emphasis(emphasis, x = x - gamma * d)
 
-        if verbose && (mod(t, print_iter) == 0 || tt == dualstep)
+        if (verbose || verbose_it) && (mod(t, print_iter) == 0 || tt == dualstep)
             if t == 0
                 tt = initial
             end
@@ -422,7 +441,7 @@ function lazified_conditional_gradient(
 
     primal = f(x)
     dual_gap = fast_dot(x, gradient) - fast_dot(v, gradient)
-    if verbose
+    if verbose || verbose_it
         tt = last
         tot_time = (time_ns() - time_start) / 1.0e9
         rep = (
@@ -436,10 +455,12 @@ function lazified_conditional_gradient(
             length(lmo),
         )
         print_callback(rep, format_string)
-        print_callback(nothing, format_string, print_footer=true)
+        if verbose
+            print_callback(nothing, format_string, print_footer=true)
+        end
         flush(stdout)
     end
 
-    return x, v, primal, dual_gap, traj_data
+    return x, v, primal, dual_gap, traj_data, lmo
 end
 """
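
Below is a minimal usage sketch (not part of the patch) of the warmstart mechanism introduced above: the first call to lazified_conditional_gradient builds the cache LMO and now returns it as a sixth output, and the second call reuses it through the new warmstart_lmo keyword. The objective, dimension, and LMO here are illustrative placeholders mirroring examples/warmstart_lmo.jl.

using FrankWolfe, LinearAlgebra

n = 1000
xp = rand(n)
f(x) = norm(x - xp)^2
grad!(storage, x) = @. storage = 2 * (x - xp)

lmo = FrankWolfe.KSparseLMO(100, 1.0)          # placeholder LMO, as in the example
x0 = FrankWolfe.compute_extreme_point(lmo, zeros(n))

# first solve: the cache LMO is built internally and returned as the last output
x, v, primal, dual_gap, traj, cache_lmo = FrankWolfe.lazified_conditional_gradient(
    f, grad!, lmo, x0, max_iteration=1_000, verbose=true,
)

# warm-started solve: pass the populated cache back in instead of rebuilding it
x, v, primal, dual_gap, traj, cache_lmo = FrankWolfe.lazified_conditional_gradient(
    f, grad!, lmo, x0, max_iteration=1_000, verbose=true, warmstart_lmo=cache_lmo,
)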