diff --git a/test/benchmark.jl b/test/benchmark.jl
index d4c7ef0..8dcb4a4 100644
--- a/test/benchmark.jl
+++ b/test/benchmark.jl
@@ -51,20 +51,40 @@ const TEST_BENCHMARK = true

         @test f1() == f2() || f1() ≈ f2()

+        bench1 = @benchmarkable $f1()
+        bench2 = @benchmarkable $f2()
+
+        # we tune the first benchmark once and reuse its tuned parameters for the second, to make the comparison fair.
+        tune!(bench1)
+        bench2.params = bench1.params
+
+        b1 = run(bench1)
+        b2 = run(bench2)
-        t1 = @belapsed $f1() seconds=1
-        t2 = @belapsed $f2() seconds=1
-
+        t1 = mean(b1).time
+        t2 = mean(b2).time
+
+        σ1 = BenchmarkTools.std(b1).time
+        σ2 = BenchmarkTools.std(b2).time
+
+        tdiff = t1 - t2
+        σdiff = sqrt(σ1^2 + σ2^2)
+
         ratio = t1 / t2
-        @info "Expr $i" ratio t1 t2
+        @info "Expr $i" t1 t2 ratio diff=Text("$(round(Int64, tdiff)) ± $(round(Int64, σdiff))") b1 b2

         if TEST_BENCHMARK
-            @test ratio < 1.2
+            # we should be faster, i.e.
+            # @test tdiff < 0
+
+            # and we accept an admissible error of ≈2.5% (a one-sided 2σ margin)
+            @test tdiff < 2*σdiff
         end
     end

     m.stop(w)
+    Distributed.rmprocs(p; waitfor=30)
 end
@@ -74,16 +94,15 @@
     w = m.Worker()
     @assert(2 == m.remotecall_fetch(+, w, 1, 1))
     m.stop(w)
-    isdefined(m, :_wait_for_exit) || return
-    m._wait_for_exit(w)
 end

 function launch_with_distributed()
     p = Distributed.addprocs(1) |> only
     @assert(2 == Distributed.remotecall_fetch(+, p, 1, 1))
-    Distributed.rmprocs(p) |> wait
+    Distributed.rmprocs(p; waitfor=30)
 end

+# run once to precompile
 launch_with_malt()
 launch_with_distributed()
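
For reference, the comparison pattern introduced by this patch can be exercised on its own. The sketch below is a minimal, self-contained version under stated assumptions: `fast` and `slow` are hypothetical stand-ins for the generated `f1`/`f2` pairs (they are not part of the patch), and the final check mirrors the new `@test tdiff < 2*σdiff` gate.

# minimal sketch of the tune-once / share-params comparison from the patch above;
# `fast` and `slow` are hypothetical stand-ins for the generated f1/f2 pairs
using BenchmarkTools
using Statistics: mean

fast() = sum(1:1_000)           # closed-form range sum
slow() = sum(collect(1:1_000))  # same result, but allocates an array

bench1 = @benchmarkable fast()
bench2 = @benchmarkable slow()

tune!(bench1)                  # pick sample/eval counts for the first benchmark...
bench2.params = bench1.params  # ...and reuse them so both are measured identically

b1 = run(bench1)
b2 = run(bench2)

t1, t2 = mean(b1).time, mean(b2).time
σ1, σ2 = BenchmarkTools.std(b1).time, BenchmarkTools.std(b2).time

tdiff = t1 - t2                # negative when `fast` wins
σdiff = sqrt(σ1^2 + σ2^2)      # std of the difference, assuming independent runs

@show tdiff < 2 * σdiff        # the same noise-tolerant gate as in the test

Sharing `bench1.params` means both trials use identical sample and evaluation counts, so `tdiff` compares like with like; the 2σ margin then tolerates measurement noise at roughly a 2.5% false-failure rate under a normal approximation, instead of the hard `ratio < 1.2` cutoff the patch removes.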