From f518d890ece4d028e7a336f62c2e0bcf709570ce Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 8 Mar 2024 11:14:53 +0100 Subject: [PATCH 1/6] adding type check for kwargs --- ext/WeightInitializersCUDAExt.jl | 2 ++ src/initializers.jl | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 45b91df..c55e36f 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -27,6 +27,7 @@ function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end + std = std isa T ? std : convert(T, std) rows, cols = dims prop_zero = min(1.0, sparsity) num_zeros = ceil(Integer, prop_zero * rows) @@ -38,6 +39,7 @@ end function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} + gain = gain isa T ? gain : convert(T, gain) if length(dims) == 1 # Bias initialization return CUDA.zeros(T, dims...) diff --git a/src/initializers.jl b/src/initializers.jl index 357b41c..84b3302 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -36,7 +36,8 @@ artificial intelligence and statistics_. 2010. """ function glorot_uniform(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1) where {T <: Number} - scale = T(gain) * sqrt(T(24) / sum(_nfan(dims...))) + gain = gain isa T ? gain : convert(T, gain) + scale = gain * sqrt(T(24) / sum(_nfan(dims...))) return (rand(rng, T, dims...) .- T(1 // 2)) .* scale end @@ -56,6 +57,7 @@ artificial intelligence and statistics_. 2010. """ function glorot_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1) where {T <: Number} + gain = gain isa T ? gain : convert(T, gain) std = T(gain) * sqrt(T(2) / sum(_nfan(dims...))) return randn(rng, T, dims...) .* std end @@ -75,6 +77,7 @@ vision_. 2015. """ function kaiming_uniform(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=√T(2)) where {T <: Number} + gain = gain isa T ? gain : convert(T, gain) bound = √T(3) * gain / sqrt(T(first(_nfan(dims...)))) return (rand(rng, T, dims...) .- T(1 // 2)) .* 2 * bound end @@ -94,6 +97,7 @@ vision_. 2015. """ function kaiming_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=√T(2)) where {T <: Number} + gain = gain isa T ? gain : convert(T, gain) std = gain / sqrt(T(first(_nfan(dims...)))) return randn(rng, T, dims...) .* std end @@ -111,6 +115,10 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( if (mean < lo - 2 * std) || (mean > hi + 2 * std) @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." end + mean = mean isa T ? mean : convert(T, mean) + std = std isa T ? std : convert(T, std) + lo = lo isa T ? lo : convert(T, lo) + hi = hi isa T ? hi : convert(T, hi) l = _norm_cdf((lo - mean) / std) u = _norm_cdf((hi - mean) / std) xs = rand(rng, T, dims...) @@ -153,6 +161,7 @@ ICLR 2014, https://arxiv.org/abs/1312.6120 function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + gain = gain isa T ? gain : convert(T, gain) if length(dims) == 2 rows, cols = dims @@ -233,6 +242,7 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end + std = std isa T ? std : convert(T, std) rows, cols = dims prop_zero = min(1.0, sparsity) num_zeros = ceil(Integer, prop_zero * rows) @@ -305,6 +315,7 @@ identity_tensor = identity_init(MersenneTwister(123), """ function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} + gain = gain isa T ? gain : convert(T, gain) if length(dims) == 1 # Bias initialization return zeros(T, dims...) From 213d65e9091f03dd715c2553ce1122e9bcb662a3 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 8 Mar 2024 12:07:03 +0100 Subject: [PATCH 2/6] added tests --- test/runtests.jl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index a2afe08..aca13c8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -114,6 +114,20 @@ const GROUP = get(ENV, "GROUP", "All") @test eltype(cl(rng, 4, 2)) == Float32 end + @testset "Kwargs types" for T in ( + Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) + if (T <: Real) + @test eltype(truncated_normal(T, 2, 5; mean=0, std=1, lo=-2, hi=2)) == T + @test eltype(orthogonal(T, 2, 5; gain=1.0)) == T + end + @test eltype(glorot_uniform(T, 2, 5; gain=1.0)) == T + @test eltype(glorot_normal(T, 2, 5; gain=1.0)) == T + @test eltype(kaiming_uniform(T, 2, 5; gain=sqrt(2))) == T + @test eltype(kaiming_normal(T, 2, 5; gain=sqrt(2))) == T + @test eltype(identity_init(T, 2, 5; gain=1.0)) == T + @test eltype(sparse_init(T, 2, 5; sparsity=0.5, std=0.01)) == T + end + @testset "kaiming" begin # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) From c200fd90bc057799e7504b29baaae268d66697be Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 8 Mar 2024 13:17:30 +0100 Subject: [PATCH 3/6] version bump --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 97d73c1..67384d9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "WeightInitializers" uuid = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d" authors = ["Avik Pal and contributors"] -version = "0.1.6" +version = "0.1.7" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" From dbe430f723a9b43a7221d7068fcba00974d44f7e Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sat, 9 Mar 2024 16:42:50 +0100 Subject: [PATCH 4/6] rm check in cuda identity_init --- ext/WeightInitializersCUDAExt.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index c55e36f..d7815da 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -39,7 +39,6 @@ end function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) if length(dims) == 1 # Bias initialization return CUDA.zeros(T, dims...) From 089c3d6e260fc9660e3198f34392971064ba98e6 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 10 Mar 2024 10:00:13 +0100 Subject: [PATCH 5/6] more straightforward checks --- ext/WeightInitializersCUDAExt.jl | 7 +++---- src/initializers.jl | 31 ++++++++++--------------------- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index d7815da..ac07b42 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -27,11 +27,10 @@ function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end - std = std isa T ? std : convert(T, std) rows, cols = dims prop_zero = min(1.0, sparsity) num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, T, dims...) .* std + sparse_array = randn(rng, T, dims...) .* T(std) sparse_array[1:num_zeros, :] .= CUDA.zero(T) return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1) @@ -47,7 +46,7 @@ function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; rows, cols = dims mat = CUDA.zeros(T, rows, cols) diag_indices = 1:min(rows, cols) - CUDA.fill!(view(mat, diag_indices, diag_indices), gain) + CUDA.fill!(view(mat, diag_indices, diag_indices), T(gain)) return CUDA.circshift(mat, shift) else # Convolution or more dimensions @@ -57,7 +56,7 @@ function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; #we should really find a better way to do this CUDA.@allowscalar for i in 1:min(nin, nout) index = (centers..., i, i) - weights[index...] = gain + weights[index...] = T(gain) end return CUDA.circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) end diff --git a/src/initializers.jl b/src/initializers.jl index 84b3302..0ed0687 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -36,8 +36,7 @@ artificial intelligence and statistics_. 2010. """ function glorot_uniform(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) - scale = gain * sqrt(T(24) / sum(_nfan(dims...))) + scale = T(gain) * sqrt(T(24) / sum(_nfan(dims...))) return (rand(rng, T, dims...) .- T(1 // 2)) .* scale end @@ -57,7 +56,6 @@ artificial intelligence and statistics_. 2010. """ function glorot_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) std = T(gain) * sqrt(T(2) / sum(_nfan(dims...))) return randn(rng, T, dims...) .* std end @@ -77,8 +75,7 @@ vision_. 2015. """ function kaiming_uniform(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=√T(2)) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) - bound = √T(3) * gain / sqrt(T(first(_nfan(dims...)))) + bound = √T(3) * T(gain) / sqrt(T(first(_nfan(dims...)))) return (rand(rng, T, dims...) .- T(1 // 2)) .* 2 * bound end @@ -97,8 +94,7 @@ vision_. 2015. """ function kaiming_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=√T(2)) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) - std = gain / sqrt(T(first(_nfan(dims...)))) + std = T(gain) / sqrt(T(first(_nfan(dims...)))) return randn(rng, T, dims...) .* std end @@ -115,17 +111,13 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( if (mean < lo - 2 * std) || (mean > hi + 2 * std) @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." end - mean = mean isa T ? mean : convert(T, mean) - std = std isa T ? std : convert(T, std) - lo = lo isa T ? lo : convert(T, lo) - hi = hi isa T ? hi : convert(T, hi) - l = _norm_cdf((lo - mean) / std) - u = _norm_cdf((hi - mean) / std) + l = _norm_cdf((T(lo) - T(mean)) / T(std)) + u = _norm_cdf((T(hi) - T(mean)) / T(std)) xs = rand(rng, T, dims...) broadcast!(xs, xs) do x x = x * 2(u - l) + (2l - 1) x = erfinv(x) - return clamp(x * std * √2 + mean, lo, hi) + return clamp(x * T(std) * √2 + T(mean), T(lo), T(hi)) end return xs end @@ -161,7 +153,6 @@ ICLR 2014, https://arxiv.org/abs/1312.6120 function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" - gain = gain isa T ? gain : convert(T, gain) if length(dims) == 2 rows, cols = dims @@ -171,7 +162,7 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end if rows < cols - return permutedims(orthogonal(rng, T, cols, rows; gain)) + return permutedims(orthogonal(rng, T, cols, rows; T(gain))) end mat = randn(rng, T, rows, cols) @@ -242,11 +233,10 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end - std = std isa T ? std : convert(T, std) rows, cols = dims prop_zero = min(1.0, sparsity) num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, T, dims...) .* std + sparse_array = randn(rng, T, dims...) .* T(std) sparse_array[1:num_zeros, :] .= zero(T) return mapslices(shuffle, sparse_array; dims=1) end @@ -315,7 +305,6 @@ identity_tensor = identity_init(MersenneTwister(123), """ function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} - gain = gain isa T ? gain : convert(T, gain) if length(dims) == 1 # Bias initialization return zeros(T, dims...) @@ -324,7 +313,7 @@ function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; rows, cols = dims mat = zeros(T, rows, cols) for i in 1:min(rows, cols) - mat[i, i] = gain + mat[i, i] = T(gain) end return circshift(mat, shift) else @@ -334,7 +323,7 @@ function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; weights = zeros(T, dims...) for i in 1:min(nin, nout) index = (centers..., i, i) - weights[index...] = gain + weights[index...] = T(gain) end return circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) end From ed1bddfea215dc51da7adbc5e08e40ef6ca62551 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 10 Mar 2024 16:27:01 +0100 Subject: [PATCH 6/6] fixed orthogonal call --- src/initializers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/initializers.jl b/src/initializers.jl index 0ed0687..fd31046 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -162,7 +162,7 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end if rows < cols - return permutedims(orthogonal(rng, T, cols, rows; T(gain))) + return permutedims(orthogonal(rng, T, cols, rows; gain=T(gain))) end mat = randn(rng, T, rows, cols)