Skip to content

Commit

Permalink
fix: update GravitationalWaveform tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 22, 2024
1 parent 0327c22 commit 92e8469
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 13 deletions.
8 changes: 4 additions & 4 deletions examples/GravitationalWaveForm/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqLowOrderRK = "1344f307-1e59-4825-a18e-ace9aa3fa4c6"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
Expand All @@ -18,7 +18,7 @@ ComponentArrays = "0.15"
LineSearches = "7"
Literate = "2"
Lux = "1"
Optimization = "3"
OptimizationOptimJL = "0.3"
OrdinaryDiffEq = "6"
Optimization = "4"
OptimizationOptimJL = "0.4"
OrdinaryDiffEqLowOrderRK = "1"
SciMLSensitivity = "7.57"
18 changes: 9 additions & 9 deletions examples/GravitationalWaveForm/main.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

# ## Package Imports

using Lux, ComponentArrays, LineSearches, OrdinaryDiffEq, Optimization, OptimizationOptimJL,
Printf, Random, SciMLSensitivity
using Lux, ComponentArrays, LineSearches, OrdinaryDiffEqLowOrderRK, Optimization,
OptimizationOptimJL, Printf, Random, SciMLSensitivity
using CairoMakie

# ## Define some Utility Functions
Expand Down Expand Up @@ -221,16 +221,16 @@ end

# We will deviate from the standard Neural Network initialization and use the initializers
# from `WeightInitializers.jl`:
const nn = Chain(Base.Fix1(broadcast, cos),
Dense(1 => 32, cos; init_weight=truncated_normal(; std=1e-4)),
Dense(32 => 32, cos; init_weight=truncated_normal(; std=1e-4)),
Dense(32 => 2; init_weight=truncated_normal(; std=1e-4)))
ps, st = Lux.setup(Xoshiro(), nn)
const nn = Chain(Base.Fix1(fast_activation, cos),
Dense(1 => 32, cos; init_weight=truncated_normal(; std=1e-4), init_bias=zeros32),
Dense(32 => 32, cos; init_weight=truncated_normal(; std=1e-4), init_bias=zeros32),
Dense(32 => 2; init_weight=truncated_normal(; std=1e-4), init_bias=zeros32))
ps, st = Lux.setup(Random.default_rng(), nn)

# Similar to most DL frameworks, Lux defaults to using `Float32`. However, in this case we
# need `Float64`.

const params = ComponentArray{Float64}(ps)
const params = ComponentArray(ps |> f64)

const nn_model = StatefulLuxLayer{true}(nn, nothing, st)

Expand Down Expand Up @@ -293,7 +293,7 @@ const mseloss = MSELoss()
function loss(θ)
pred = Array(solve(prob_nn, RK4(); u0, p=θ, saveat=tsteps, dt, adaptive=false))
pred_waveform = first(compute_waveform(dt_data, pred, mass_ratio, ode_model_params))
return mseloss(waveform, pred_waveform), pred_waveform
return mseloss(pred_waveform, waveform), pred_waveform
end

# Warm up the loss function
Expand Down

1 comment on commit 92e8469

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 92e8469 Previous: 0327c22 Ratio
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s) 415000 ns 415000 ns 1
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s) 244167 ns 244000 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s) 243917 ns 244395.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s) 740083 ns 741625 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA 43793 ns 43299 ns 1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s) 1280333 ns 1269375 ns 1.01
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s) 1268791 ns 1242021 ns 1.02
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s) 16455125 ns 16399916 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s) 2193625.5 ns 2241187 ns 0.98
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA 205231 ns 204002 ns 1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s) 1311917 ns 1349042 ns 0.97
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s) 1301792 ns 1294500 ns 1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s) 16522625 ns 16764541 ns 0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s) 2229625 ns 2232041.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1672666 ns 1752375 ns 0.95
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1078166 ns 1093646 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1511041.5 ns 1492959 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2994458 ns 3024125 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 207884 ns 207489.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12154146 ns 12156687.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 8856791 ns 8836167 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9297792 ns 9199229.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18579708 ns 18606000 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1492665 ns 1487095.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17297396 ns 17315979 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 13998833 ns 13972625 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14511000 ns 14444750 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21839416 ns 21832209 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250544729 ns 250603708 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148581208 ns 148666500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 116355916.5 ns 116680792 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447348667 ns 446998834 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5449372 ns 5475042 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1226769166 ns 1221297083 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 930331417 ns 932803916 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 829560312.5 ns 830875312.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1631272125 ns 1633169958 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 31620503.5 ns 31270992 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1143568125 ns 1142742084 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 993275583.5 ns 992312021 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1332092333.5 ns 1338745271 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1732940916.5 ns 1731363313 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 1119875 ns 1118896 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 1650333 ns 1648500 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 3433334 ns 3550750 ns 0.97
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 782354 ns 779895.5 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 263984.5 ns 263872 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2986166 ns 2989959 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 4134521 ns 4148875 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 9684479 ns 10113125 ns 0.96
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3141166 ns 3158708.5 ns 0.99
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1099110 ns 1091990.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 2222125 ns 2343042 ns 0.95
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1310979 ns 1334167 ns 0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1561042 ns 1566042 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 4207458 ns 4207541 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 208127 ns 208150 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 19407062.5 ns 19429625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 16092937.5 ns 16109333 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 17317479 ns 17422500 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 25877354.5 ns 25879770.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1588570 ns 1587958 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 34283042 ns 34137833 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 31029667 ns 30953770.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 31324334 ns 31126834 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 36972625 ns 36603292 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 4535728.5 ns 4553458 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2550437.5 ns 2548333 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2682521 ns 2680479 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 8376542 ns 8392292 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 420059 ns 419498 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 38787729 ns 39173771 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 32133646 ns 32069583 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 32252916 ns 32250333 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 51916459 ns 51921166 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2624143 ns 2616748 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 88908791 ns 89448084 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 114840750 ns 115210625 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 227998375 ns 223006041 ns 1.02
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 74777479 ns 74834709 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 269000958 ns 268830916 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 156605625 ns 155940083 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 123282250 ns 123481354.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 485266417 ns 485017458 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7007944 ns 7045661 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1477600500.5 ns 1468709770.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 1177860417 ns 1170852875 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 1059255604.5 ns 1060936145.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 2001527437.5 ns 2003258354 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34509709 ns 34686788.5 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1725457125 ns 1717082916 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1535708771 ns 1544192313 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1892793750 ns 1911836416 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 2208396292 ns 2210230895.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 2072875 ns 2101000.5 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 3011791 ns 3069792 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 8320459 ns 7782562.5 ns 1.07
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2450499.5 ns 2470375 ns 0.99
lenet(28, 28, 1, 128)/forward/GPU/CUDA 268533.5 ns 266658.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 9519292 ns 9672229 ns 0.98
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 12095020.5 ns 12092375 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 24991500 ns 24297291.5 ns 1.03
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 11770084 ns 11738333 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1173232 ns 1162784.5 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 383052437.5 ns 379589354 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 311828042 ns 309610249.5 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 269993541.5 ns 271697750 ns 0.99
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 452443833.5 ns 452186458.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4865362.5 ns 4826362 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 1155538583 ns 1153829542 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 936810083 ns 937182416 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 959183583 ns 946442459 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 1397577000 ns 1402726625 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 19191910 ns 17871272 ns 1.07
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1053520.5 ns 1060583.5 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 1668459 ns 1665104 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 5692083 ns 5684333 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1396104.5 ns 1293250 ns 1.08
lenet(28, 28, 1, 64)/forward/GPU/CUDA 270444.5 ns 266056.5 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6494584 ns 6298000 ns 1.03
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 13134333 ns 13078042 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 19522667 ns 19625729.5 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 6062833 ns 6036063 ns 1.00
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1205114.5 ns 1205857 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70593167 ns 70540979 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43687500 ns 43826125 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39756500 ns 39756625 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132546521 ns 132576562.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1861025.5 ns 1928339 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 356256979 ns 355456104 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 270180000 ns 270074458 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 253147750 ns 254482208.5 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 535028854 ns 534597416.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 12303646 ns 12296862 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 400021667 ns 395449625 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 374059625 ns 371290834 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 723689958.5 ns 729009728.5 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 712462250 ns 711332250 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 1195955667 ns 1188343167 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 833640041.5 ns 829330374.5 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 641220229.5 ns 641218812.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 1769113729 ns 1770400208.5 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12497145 ns 12316126 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 3639556520.5 ns 3615525250 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 2825360333 ns 2829654958 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 2702765709 ns 2706402500 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 5019640833 ns 5029947166 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49951471 ns 49544504.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3421500 ns 3431458 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2074979 ns 2061979 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2545666 ns 2539292 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6030125 ns 6030625 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 343299 ns 290543 ns 1.18
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 26132666.5 ns 25968458 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 19030500 ns 18987208 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19345021 ns 19553000 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 39337834 ns 39349375 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2467033.5 ns 2461393.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 54504542 ns 54481083 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 81980333 ns 83276583.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 173279167 ns 179067500 ns 0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 45606041 ns 45593208 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1787396 ns 1786750 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1095125 ns 1105896 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1559166 ns 1574250 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3050791 ns 3030083 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213819 ns 212590.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12546291 ns 12547583.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9225062.5 ns 9224833 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9642333.5 ns 9634083 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 19019500 ns 19000834 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1532922 ns 1540163 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17668667 ns 17638166.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14332167 ns 14340604.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14597000 ns 14590667 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 22175750.5 ns 22206375 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70541417 ns 70473374.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43674667 ns 43776770.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39704500 ns 39709833 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132649271 ns 132558937.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1938611 ns 1941367.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 361084062.5 ns 360278666 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 347061583.5 ns 348229917 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 305013375 ns 304100458 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 723885708 ns 724915208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13388921 ns 13369766.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 425519667 ns 420187917 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 427658750 ns 421516541 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 736440729.5 ns 709368500.5 ns 1.04
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 715989083 ns 716565750 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 1596542 ns 1595104.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 1135916 ns 1155875 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 1138166.5 ns 1141042 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 2412708 ns 2398709 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 587435 ns 589817.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 8847312 ns 8847459 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 13684021 ns 13657208.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 32863792 ns 33215000 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 9875083 ns 9861333.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1416297.5 ns 1439685.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 16549687.5 ns 16590416.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 22946333.5 ns 23377417 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 47499854 ns 48606521 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 13135792 ns 13167729 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s) 827646 ns 827750 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s) 514125 ns 572417 ns 0.90
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s) 1076104 ns 1030375 ns 1.04
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s) 725021 ns 723625 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA 47722 ns 47526 ns 1.00
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s) 1531958 ns 1545916 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s) 1005542 ns 1016104.5 ns 0.99
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s) 1422834 ns 1496709 ns 0.95
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s) 2290271 ns 2249771.5 ns 1.02
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA 235161 ns 233321.5 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s) 1550625 ns 1542666 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s) 1063666.5 ns 1081042 ns 0.98
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s) 1456541 ns 1462375 ns 1.00
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s) 2260042 ns 2263999.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3417917 ns 3411417 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2065041 ns 2072375 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2482708 ns 2519729 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6009500 ns 6003959 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 284432 ns 286692.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24080042 ns 24108583 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17195500 ns 17195125 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17121125 ns 17134396 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37501854 ns 37539229 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2416353 ns 2409084 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 52890167 ns 52901854 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 84990875 ns 84174146 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 173811125 ns 176459666.5 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 44527208 ns 44549125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250510875 ns 250550666 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148711500 ns 148903458 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 116106354 ns 116179937.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447706104 ns 447642729 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5473947 ns 5446946 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1104910333 ns 1105006292 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 852696229 ns 855140500 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 828124666.5 ns 828516104.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1753883208 ns 1750786541 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 29129663 ns 28975639.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1027987062.5 ns 1027977458.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 967528166 ns 971875209 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1323494083.5 ns 1322670249.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1721562854.5 ns 1720633478.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1199000 ns 1103938 ns 1.09
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 722000 ns 681708 ns 1.06
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 723333.5 ns 780771 ns 0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2059938 ns 2053729 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 566089.5 ns 565890 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 5883354 ns 5865542 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 9012521 ns 8901292 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 26898459 ns 27017958 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 7112042 ns 7111541 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1371381.5 ns 1352222 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 9684083 ns 9684916.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 16051250 ns 16130125 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 33056542 ns 34128937.5 ns 0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 7626499.5 ns 7630583 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s) 522916.5 ns 519541 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s) 390125.5 ns 425396 ns 0.92
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s) 3390917 ns 2668499.5 ns 1.27
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s) 89292 ns 88542 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA 28324 ns 27675 ns 1.02
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s) 380812.5 ns 379958 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s) 444875 ns 444000 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s) 5040083.5 ns 4753729.5 ns 1.06
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s) 259041 ns 258583 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA 219450.5 ns 218086 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s) 411083 ns 413521 ns 0.99
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s) 475270.5 ns 474959 ns 1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s) 4889250 ns 4525333 ns 1.08
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s) 271084 ns 273666 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s) 465208.5 ns 466645.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s) 318584 ns 359083.5 ns 0.89
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s) 778771 ns 903250 ns 0.86
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s) 54354.5 ns 53375 ns 1.02
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA 28220 ns 27956 ns 1.01
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s) 340333 ns 339958.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s) 341958 ns 341541 ns 1.00
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s) 734125 ns 663834 ns 1.11
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s) 151417 ns 151770.5 ns 1.00
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA 205814.5 ns 204249 ns 1.01
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s) 351792 ns 354416 ns 0.99
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s) 356604.5 ns 356167 ns 1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s) 935583 ns 638708.5 ns 1.46
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s) 151000 ns 151083 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 606312458 ns 601408417 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 430997020.5 ns 429624958.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 382921125 ns 381216438 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 871105000 ns 871782875 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7038469 ns 7027859.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 2005974042 ns 1999886687.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1610239562.5 ns 1620871562.5 ns 0.99
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 1558401520.5 ns 1551986813 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 2631627625 ns 2627061917 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26000726 ns 26164340 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s) 539604 ns 532083 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s) 396875 ns 394416.5 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s) 3106167 ns 2876833 ns 1.08
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s) 866292 ns 866292 ns 1
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA 47775 ns 47203 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s) 1813250 ns 1837916 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s) 1736667 ns 1743166.5 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s) 16480542 ns 16426958 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s) 2648000 ns 2767000 ns 0.96
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA 246886 ns 245777 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s) 1867042 ns 1958146 ns 0.95
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s) 1816500 ns 1839417 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s) 16523458 ns 16382583 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s) 2741770.5 ns 2793000 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1439604.5 ns 1483854.5 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 934625 ns 1015145.5 ns 0.92
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1053375.5 ns 1027417 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2331625 ns 2341250 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 580680 ns 585975.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 5896895.5 ns 5894542 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 8530979 ns 8413959 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 26479875.5 ns 25732583.5 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 7269958 ns 7339459 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1365923.5 ns 1337532.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 11687917 ns 11684084 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 18462792 ns 18235479.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 39354708.5 ns 38621167 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 9551562.5 ns 9550958 ns 1.00
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s) 4541.5 ns 2854.5 ns 1.59
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s) 3000 ns 2542 ns 1.18
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s) 3333 ns 3458 ns 0.96
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s) 4750 ns 2833 ns 1.68
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA 25041 ns 24549 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s) 7333.5 ns 7208.5 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s) 7208 ns 7292 ns 0.99
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s) 7187.5 ns 7312.5 ns 0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s) 7208 ns 7166 ns 1.01
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA 213760.5 ns 208459 ns 1.03
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s) 8500 ns 8416 ns 1.01
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s) 8333 ns 8583 ns 0.97
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s) 8459 ns 8542 ns 0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s) 6167 ns 6042 ns 1.02
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s) 10375 ns 10604 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s) 13833 ns 13125 ns 1.05
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s) 11229.5 ns 11375 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s) 9250 ns 7750 ns 1.19
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA 25667 ns 24844 ns 1.03
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s) 20041 ns 19833 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s) 19917 ns 20250 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s) 20083 ns 20125 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s) 19584 ns 19875 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA 233795.5 ns 227881 ns 1.03
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s) 23833 ns 23583 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s) 23541.5 ns 23959 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s) 23750 ns 23916 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s) 21333 ns 21250 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s) 28542 ns 28750 ns 0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s) 28542 ns 28625 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s) 28750 ns 28834 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s) 46083 ns 46792 ns 0.98
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA 26413 ns 25949 ns 1.02
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s) 227625 ns 227833.5 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s) 277333 ns 271458 ns 1.02
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s) 3752584 ns 4319479 ns 0.87
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s) 145792 ns 145375 ns 1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA 215287 ns 205742 ns 1.05
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s) 246083 ns 247458.5 ns 0.99
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s) 294959 ns 289334 ns 1.02
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s) 4140167 ns 4121125 ns 1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s) 145458 ns 145417 ns 1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s) 3875 ns 2250 ns 1.72
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s) 1792 ns 2125 ns 0.84
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s) 2291.5 ns 2292 ns 1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s) 1958 ns 2084 ns 0.94
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA 23326 ns 22754 ns 1.03
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s) 5333 ns 5458 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s) 5125 ns 5209 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s) 5250 ns 5375 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s) 5125 ns 5250 ns 0.98
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA 246332 ns 246015 ns 1.00
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s) 7625 ns 7625 ns 1
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s) 7416 ns 7584 ns 0.98
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s) 7770.5 ns 7625 ns 1.02
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s) 5250 ns 5209 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 80124625 ns 79946667 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 47921000 ns 47907313 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 43331166.5 ns 43314937.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 151470167 ns 151418166 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2687344 ns 2713892 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 672319791 ns 607476458 ns 1.11
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 413871833 ns 410727459 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 397456333.5 ns 397021709 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 687252833 ns 682137042 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 14598552.5 ns 14584860 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 695248479.5 ns 713299583 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 677318208 ns 679401291 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 996212291 ns 1004795584 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 997847458 ns 1000525250 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.