From b888479d528865c0baf38fc2295b3072b582a584 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Thu, 18 Jan 2024 14:11:37 +0100 Subject: [PATCH 01/12] rebase adding orthogonal --- Project.toml | 2 ++ src/WeightInitializers.jl | 2 ++ src/initializers.jl | 36 +++++++++++++++++++++++++++++++++++- test/runtests.jl | 33 +++++++++++++++++++++++++++------ 4 files changed, 66 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index a71f74f..06d33e8 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,8 @@ version = "0.1.5" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index 446fa8f..869b5b6 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -1,6 +1,7 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations +using PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra @recompile_invalidations begin using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics @@ -25,5 +26,6 @@ export zerosC64, onesC64, randC64, randnC64, zerosC32, onesC32, randC32, randnC3 export glorot_normal, glorot_uniform export kaiming_normal, kaiming_uniform export truncated_normal +export orthogonal end diff --git a/src/initializers.jl b/src/initializers.jl index ec9900d..7e10893 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -122,9 +122,43 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( return xs end +""" + orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain = 1) where {T <: Real} -> AbstractArray{T, length(dims)} + orthogonal(rng::AbstractRNG; kw...) -> Function + +Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a (semi) orthogonal matrix, as described in [^Saxe14] + +The function constructs an orthogonal or semi-orthogonal matrix depending on the specified dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to the original dimensions. + +Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. + +# Arguments + + - `rng::AbstractRNG`: Random number generator. + - `T::Type{<:Real}`: The type of the elements in the array. + - `dims::Integer...`: The dimensions of the array. + - `gain::Number`: Scaling factor for the elements of the orthogonal matrix. + +# References + +[^Saxe14] Saxe, McClelland, Ganguli. 
"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 +""" +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=1) where {T <: Real} + @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" + rows, cols = dims + if rows < cols + return permutedims(orthogonal(rng, T, cols, rows; gain)) + end + mat = randn(rng, T, rows, cols) + Q, R = LinearAlgebra.qr(mat) + mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) + return mat +end + # Default Fallbacks for all functions for initializer in (:glorot_uniform, :glorot_normal, :kaiming_uniform, :kaiming_normal, - :truncated_normal) + :truncated_normal, :orthogonal) NType = ifelse(initializer === :truncated_normal, Real, Number) @eval function ($initializer)(dims::Integer...; kwargs...) return $initializer(_default_rng(), Float32, dims...; kwargs...) diff --git a/test/runtests.jl b/test/runtests.jl index 4b4c595..061a809 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -32,7 +32,8 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, - kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, truncated_normal + kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, + truncated_normal, orthogonal, ] # Sizes @test size(init(3)) == (3,) @@ -77,8 +78,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal], - T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, orthogonal], T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -98,11 +98,16 @@ const GROUP = get(ENV, "GROUP", "All") end @testset "Closure: $init" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal] + glorot_uniform, glorot_normal, truncated_normal, orthogonal] cl = init(;) # Sizes - @test size(cl(3)) == (3,) - @test size(cl(rng, 3)) == (3,) + if init == orthogonal + @test_throws AssertionError cl(3) + @test_throws AssertionError cl(rng, 3) + else + @test size(cl(3)) == (3,) + @test size(cl(rng, 3)) == (3,) + end @test size(cl(3, 4)) == (3, 4) @test size(cl(rng, 3, 4)) == (3, 4) @test size(cl(3, 4, 5)) == (3, 4, 5) @@ -141,6 +146,22 @@ const GROUP = get(ENV, "GROUP", "All") end @test eltype(init(3, 4; gain=1.5)) == Float32 end + + @testset "orthogonal" begin + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rows, cols) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 + end end @testset "Warning: truncated_normal" begin From 867f09a84e58a89ffe182d174e0815dd13c656c8 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sat, 20 Jan 2024 18:21:55 +0100 Subject: [PATCH 02/12] fixing orthogonal --- src/initializers.jl | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index 7e10893..4c9f13c 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,17 +143,29 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; - gain::Number=1) where {T <: Real} +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1)) where {T <: Real} @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" - rows, cols = dims + + if length(dims) == 2 + rows, cols = dims + else + rows = prod(dims[1:end-1]) + cols = dims[end] + end + if rows < cols return permutedims(orthogonal(rng, T, cols, rows; gain)) end + mat = randn(rng, T, rows, cols) Q, R = LinearAlgebra.qr(mat) mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) - return mat + + if length(dims) > 2 + return reshape(mat, dims) + else + return mat + end end # Default Fallbacks for all functions From b56a09d3082b878ecaed93eb72f896f846c6a83f Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Thu, 1 Feb 2024 21:31:03 +0100 Subject: [PATCH 03/12] rebase added identity_init, sparse_init --- ext/WeightInitializersCUDAExt.jl | 61 ++++++++++++- src/WeightInitializers.jl | 2 + src/initializers.jl | 149 +++++++++++++++++++++++++++++-- test/Project.toml | 11 +++ test/runtests.jl | 22 ++--- 5 files changed, 225 insertions(+), 20 deletions(-) create mode 100644 test/Project.toml diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 4d6e365..eb04364 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,7 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA -import WeightInitializers: __partial_apply, NUM_TO_FPOINT +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,4 +19,63 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end +function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + gain::Number=1, shift::Integer=0) where {T <: Number} + if length(dims) == 1 + # Bias initialization + return CUDA.zeros(T, dims...) + elseif length(dims) == 2 + # Matrix multiplication + rows, cols = dims + mat = CUDA.zeros(T, rows, cols) + diag_indices = 1:min(rows, cols) + CUDA.fill!(view(mat, diag_indices, diag_indices), gain) + return CUDA.circshift(mat, shift) + else + # Convolution or more dimensions + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = CUDA.zeros(T, dims...) + #we should really find a better way to do this + CUDA.@allowscalar for i in 1:min(nin, nout) + index = (centers..., i, i) + weights[index...] 
= gain + end + return CUDA.circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) + end +end + +function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= CUDA.zero(T) + + for col in 1:cols + sparse_array[:, col] = CUDA.shuffle(rng, sparse_array[:, col]) + end + + return sparse_array +end + +for initializer in (:sparse_init, :identity_init) + @eval function ($initializer)(rng::AbstractCuRNG, dims::Integer...; kwargs...) + return $initializer(rng, Float32, dims...; kwargs...) + end + + @eval function ($initializer)(rng::AbstractCuRNG; kwargs...) + return __partial_apply($initializer, (rng, (; kwargs...))) + end + @eval function ($initializer)(rng::AbstractCuRNG, + ::Type{T}; kwargs...) where {T <: Number} + return __partial_apply($initializer, ((rng, T), (; kwargs...))) + end +end + end diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index 869b5b6..b2db3cb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -27,5 +27,7 @@ export glorot_normal, glorot_uniform export kaiming_normal, kaiming_uniform export truncated_normal export orthogonal +export sparse_init +export identity_init end diff --git a/src/initializers.jl b/src/initializers.jl index 4c9f13c..3e1f99a 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,20 +143,23 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1)) where {T <: Real} - @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" - +function orthogonal(rng::AbstractRNG, + ::Type{T}, + dims::Integer...; + gain::Number=T(1)) where {T <: Real} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + if length(dims) == 2 rows, cols = dims else - rows = prod(dims[1:end-1]) + rows = prod(dims[1:(end - 1)]) cols = dims[end] end if rows < cols return permutedims(orthogonal(rng, T, cols, rows; gain)) end - + mat = randn(rng, T, rows, cols) Q, R = LinearAlgebra.qr(mat) mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) @@ -168,9 +171,143 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number= end end +""" + sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=0.01) where {T <: Number} -> AbstractArray{T} + +Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, using random numbers drawn from a normal distribution for the non-zero elements. This method is introduced in [^Martens2010]. +Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. For example, a sparsity of 0.3 means that approximately 30% of the elements will be set to zero. The non-zero elements are distributed according to a normal distribution, scaled by the std parameter. + +# Arguments + + - `rng::AbstractRNG`: The random number generator to use. 
+ - `T::Type{<:Number}`: The numeric type of the elements in the returned array. + - `dims::Integer...`: The dimensions of the weight matrix to be generated. + - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1. + - `std::Number=0.01`: The standard deviation of the normal distribution before applying `gain`. + +# Returns + + - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type `T`. + +# Examples + +```julia +using Random + +# Initialize a 5x5 sparsely initialized matrix with 30% sparsity +rng = MersenneTwister(123) +matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) +``` + +``` +5×5 Matrix{Float64}: + 0.0 0.00273815 0.00592403 0.0 0.0 + 0.00459416 -0.000754831 -0.00888936 -0.0077507 0.0 + 0.0 -0.00194229 0.0 0.0 -0.00468489 + 0.0114265 0.0 0.0 -0.00734886 0.00277726 + -0.00396679 0.0 0.00327215 -0.0071741 -0.00880897 +``` + +# References + +[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. +""" +function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= zero(T) + + for col in 1:cols + sparse_array[:, col] = shuffle(rng, sparse_array[:, col]) + end + + return sparse_array +end + +""" + identity_init(rng::AbstractRNG, ::Type{T}, size...; gain::Number=1, shift::Union{Integer, Tuple{Integer, Integer}}=0) where {T <: Number} -> AbstractArray{T} + +Constructs an array that aims to provide an identity mapping when used as parameters in most layers of a neural network. The identity mapping is scaled by the `gain` parameter. + +# Behavior + + - 1D: Returns a `Vector` of zeros (useful for biases in layers where `input_size == output_size`). + - 2D: Returns an identity matrix (useful for fully connected layers with equal input and output sizes). + - More than 2D: Returns a tensor where the central slice along the last two dimensions is an identity matrix, and the rest are zeros (useful for convolutional layers, simulating an identity convolution). + +# Caveats + + - Not all layers will result in an identity mapping when using this initializer. Exceptions include recurrent and normalization layers. + - Layers must have `input_size == output_size` for a perfect identity mapping. In cases where this condition is not met, the function pads extra dimensions with zeros. + - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, and appropriate padding must be applied to ensure the output feature maps are the same size as the input feature maps. + +# Arguments + + - `rng::AbstractRNG`: An optional random number generator, included for consistency with other initializers but ignored since the output is deterministic. + - `T::Type{<:Number}`: The numeric type of the array elements. + - `size...`: The dimensions of the array to be initialized. + - `gain::Number=1`: A scaling factor applied to the identity mapping. + - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or a tuple specifying the circular shift applied to the output array. 
+ +# Returns + + - `AbstractArray{T}`: An array initialized to represent an identity mapping, scaled by `gain` and optionally shifted by `shift`. + +# Examples + +```julia +using Random + +# Identity matrix for fully connected layer +identity_matrix = identity_init(MersenneTwister(123), Float32, 5, 5) + +# Identity tensor for convolutional layer +identity_tensor = identity_init(MersenneTwister(123), + Float32, # Bias initialization + 3, + 3, + 5, # Matrix multiplication + 5; + gain=1.5, + shift=(1, 0)) +``` +""" +function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=1, shift::Integer=0) where {T <: Number} + if length(dims) == 1 + # Bias initialization + return zeros(T, dims...) + elseif length(dims) == 2 + # Matrix multiplication + rows, cols = dims + mat = zeros(T, rows, cols) + for i in 1:min(rows, cols) + mat[i, i] = gain + end + return circshift(mat, shift) + else + # Convolution or more dimensions + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = zeros(T, dims...) + for i in 1:min(nin, nout) + index = (centers..., i, i) + weights[index...] = gain + end + return circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) + end +end + # Default Fallbacks for all functions for initializer in (:glorot_uniform, :glorot_normal, :kaiming_uniform, :kaiming_normal, - :truncated_normal, :orthogonal) + :truncated_normal, :orthogonal, :sparse_init, :identity_init) NType = ifelse(initializer === :truncated_normal, Real, Number) @eval function ($initializer)(dims::Integer...; kwargs...) return $initializer(_default_rng(), Float32, dims...; kwargs...) diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..0adcca7 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,11 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +julia = "1.6" diff --git a/test/runtests.jl b/test/runtests.jl index 061a809..647e458 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ -using Aqua, WeightInitializers, Test, Statistics -using StableRNGs, Random, CUDA +using Aqua +using WeightInitializers, Test, SafeTestsets, Statistics +using StableRNGs, Random, CUDA, LinearAlgebra CUDA.allowscalar(false) @@ -33,7 +34,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, - truncated_normal, orthogonal, + truncated_normal, identity_init, ] # Sizes @test size(init(3)) == (3,) @@ -78,7 +79,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal, orthogonal], T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, identity_init], T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -98,16 +99,11 @@ const GROUP = get(ENV, "GROUP", "All") end @testset "Closure: $init" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, 
glorot_normal, truncated_normal, orthogonal] + glorot_uniform, glorot_normal, truncated_normal, identity_init] cl = init(;) # Sizes - if init == orthogonal - @test_throws AssertionError cl(3) - @test_throws AssertionError cl(rng, 3) - else - @test size(cl(3)) == (3,) - @test size(cl(rng, 3)) == (3,) - end + @test size(cl(3)) == (3,) + @test size(cl(rng, 3)) == (3,) @test size(cl(3, 4)) == (3, 4) @test size(cl(rng, 3, 4)) == (3, 4) @test size(cl(3, 4, 5)) == (3, 4, 5) @@ -146,7 +142,7 @@ const GROUP = get(ENV, "GROUP", "All") end @test eltype(init(3, 4; gain=1.5)) == Float32 end - + @testset "orthogonal" begin # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. for (rows, cols) in [(5, 3), (3, 5)] From ee6fe0037b511f9f8f00d93bade43ce28056131e Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 12 Feb 2024 18:40:53 +0100 Subject: [PATCH 04/12] rebase test structure for orthogonal, small fixes --- ext/WeightInitializersCUDAExt.jl | 29 ++++++++++++++++++++- src/initializers.jl | 9 +++---- test/runtests.jl | 43 +++++++++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index eb04364..1137d1f 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,7 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA -import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,6 +19,33 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end +function orthogonal(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + gain::Number=T(1.0)) where {T <: Number} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + + if length(dims) == 2 + rows, cols = dims + else + rows = prod(dims[1:(end - 1)]) + cols = dims[end] + end + + if rows < cols + return CUDA.permutedims(orthogonal(rng, T, cols, rows; gain)) + end + + mat = randn(rng, T, rows, cols) + Q, R = CUDA.qr(mat) + mat .= Q * sign.(CUDA.diag(R)) .* T(gain) + + if length(dims) > 2 + return CUDA.reshape(mat, dims) + else + return mat + end +end + + function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} if length(dims) == 1 diff --git a/src/initializers.jl b/src/initializers.jl index 3e1f99a..c8141ff 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,11 +143,10 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. 
"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - gain::Number=T(1)) where {T <: Real} - @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=T(1.0)) where {T <: Number} + + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" if length(dims) == 2 rows, cols = dims diff --git a/test/runtests.jl b/test/runtests.jl index 647e458..c13ac51 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -160,9 +160,46 @@ const GROUP = get(ENV, "GROUP", "All") end end - @testset "Warning: truncated_normal" begin - @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ - the distribution of values may be inaccurate." truncated_normal(2; mean=-5.0f0) + @testset "Orthogonal rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. + # In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rng, rows, cols) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(rng, mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + # Type + @testset "Orthogonal Types $T" for T in (Float16, Float32, Float64) + @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T + @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T + end + @testset "Orthogonal AbstractArray Type $T" for T in (Float16, Float32, Float64) + @test orthogonal(T, 3, 5) isa AbstractArray{T, 2} + @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng, T) + @test cl(3, 5) isa arrtype{T, 2} + end + @testset "Orthogonal Closure" begin + cl = orthogonal(;) + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + @test size(cl(3, 4, 5)) == (3, 4, 5) + @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end end @testset "Aqua: Quality Assurance" begin From 215cd5e981a6a954a5ae22daaebdd3bd2a517ca8 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Tue, 20 Feb 2024 17:33:46 +0100 Subject: [PATCH 05/12] small fixes and finalizing tests --- ext/WeightInitializersCUDAExt.jl | 50 ++++++----------------- src/initializers.jl | 11 ++---- test/runtests.jl | 68 ++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 50 deletions(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 1137d1f..6de1f27 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,6 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA +using Random import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,30 +20,20 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end -function orthogonal(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; - gain::Number=T(1.0)) where {T <: Number} - @assert 
length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" - if length(dims) == 2 - rows, cols = dims - else - rows = prod(dims[1:(end - 1)]) - cols = dims[end] - end - - if rows < cols - return CUDA.permutedims(orthogonal(rng, T, cols, rows; gain)) +function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end - mat = randn(rng, T, rows, cols) - Q, R = CUDA.qr(mat) - mat .= Q * sign.(CUDA.diag(R)) .* T(gain) + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= CUDA.zero(T) - if length(dims) > 2 - return CUDA.reshape(mat, dims) - else - return mat - end + return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1) end @@ -72,25 +63,6 @@ function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; end end -function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; - sparsity::Number, std::Number=T(0.01)) where {T <: Number} - if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) - end - - rows, cols = dims - prop_zero = min(1.0, sparsity) - num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, T, dims...) .* std - sparse_array[1:num_zeros, :] .= CUDA.zero(T) - - for col in 1:cols - sparse_array[:, col] = CUDA.shuffle(rng, sparse_array[:, col]) - end - - return sparse_array -end - for initializer in (:sparse_init, :identity_init) @eval function ($initializer)(rng::AbstractCuRNG, dims::Integer...; kwargs...) return $initializer(rng, Float32, dims...; kwargs...) diff --git a/src/initializers.jl b/src/initializers.jl index c8141ff..2f771cb 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -160,8 +160,8 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end mat = randn(rng, T, rows, cols) - Q, R = LinearAlgebra.qr(mat) - mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) + Q, R = qr(mat) + mat .= Q * sign.(Diagonal(R)) .* T(gain) if length(dims) > 2 return reshape(mat, dims) @@ -222,12 +222,7 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; num_zeros = ceil(Integer, prop_zero * rows) sparse_array = randn(rng, T, dims...) .* std sparse_array[1:num_zeros, :] .= zero(T) - - for col in 1:cols - sparse_array[:, col] = shuffle(rng, sparse_array[:, col]) - end - - return sparse_array + return mapslices(shuffle, sparse_array, dims=1) end """ diff --git a/test/runtests.jl b/test/runtests.jl index c13ac51..ee797c2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -175,11 +175,11 @@ const GROUP = get(ENV, "GROUP", "All") CUDA.@allowscalar rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) end # Type - @testset "Orthogonal Types $T" for T in (Float16, Float32, Float64) + @testset "Orthogonal Types $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T end - @testset "Orthogonal AbstractArray Type $T" for T in (Float16, Float32, Float64) + @testset "Orthogonal AbstractArray Type $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @test orthogonal(T, 3, 5) isa AbstractArray{T, 2} @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} @@ -202,8 +202,70 @@ const GROUP = get(ENV, "GROUP", "All") end end + @testset "sparse_init rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes + # sparse_init should yield an error for non 2-d dimensions + # sparse_init should yield no zero elements if sparsity < 0 + # sparse_init should yield all zero elements if sparsity > 1 + # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values + # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter + + @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) + @test_throws ArgumentError sparse_init(3, sparsity=0.1) + v = sparse_init(100, 100, sparsity=-0.1) + @test sum(v .== 0) == 0 + v = sparse_init(100, 100, sparsity=1.1) + @test sum(v .== 0) == length(v) + + for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] + expected_zeros = ceil(Integer, n_in * sparsity) + v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) + @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + end + + # Type + @testset "sparse_init Types $T" for T in (Float16, Float32, Float64) + @test eltype(sparse_init(rng, T, 3, 4; sparsity=0.5)) == T + end + @testset "sparse_init AbstractArray Type $T" for T in (Float16, Float32, Float64) + @test sparse_init(T, 3, 5; sparsity=0.5) isa AbstractArray{T, 2} + @test sparse_init(rng, T, 3, 5; sparsity=0.5) isa arrtype{T, 2} + + cl = sparse_init(rng; sparsity=0.5) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = sparse_init(rng, T; sparsity=0.5) + @test cl(3, 5) isa arrtype{T, 2} + end + @testset "sparse_init Closure" begin + cl = sparse_init(; sparsity=0.5) + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end + end + + @testset "identity_init" begin + @testset "Non-identity sizes" begin + @test identity_init(2, 3)[:, end] == zeros(Float32, 2) + @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) + @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) + @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) + @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) + end + end + + @static if VERSION ≥ v"1.9" + @testset "Warning: truncated_normal" begin + @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." 
truncated_normal(2; + mean=-5.0f0) + end + end + @testset "Aqua: Quality Assurance" begin Aqua.test_all(WeightInitializers; ambiguities=false) Aqua.test_ambiguities(WeightInitializers; recursive=false) - end end From 24485b5a468386f7e66cb3c962a8dada7265a974 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:26:18 +0100 Subject: [PATCH 06/12] small fix --- test/runtests.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ee797c2..4cc13c3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -258,11 +258,9 @@ const GROUP = get(ENV, "GROUP", "All") end end - @static if VERSION ≥ v"1.9" - @testset "Warning: truncated_normal" begin - @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." truncated_normal(2; - mean=-5.0f0) - end + @testset "Warning: truncated_normal" begin + @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ + the distribution of values may be inaccurate." truncated_normal(2; mean=-5.0f0) end @testset "Aqua: Quality Assurance" begin From a199793f9014081b158bab5729ac5ae624c516b0 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:29:07 +0100 Subject: [PATCH 07/12] up version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 06d33e8..444f032 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "WeightInitializers" uuid = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d" authors = ["Avik Pal and contributors"] -version = "0.1.5" +version = "0.1.6" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" From 5cdf8c4b976762d10220b3e2adb46985a5e0e8dd Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:44:17 +0100 Subject: [PATCH 08/12] final fixes --- Project.toml | 2 +- ext/WeightInitializersCUDAExt.jl | 7 +++---- src/initializers.jl | 7 +++---- test/Project.toml | 11 ----------- test/runtests.jl | 24 ++++++++++++++---------- 5 files changed, 21 insertions(+), 30 deletions(-) delete mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 444f032..97d73c1 100644 --- a/Project.toml +++ b/Project.toml @@ -6,7 +6,6 @@ version = "0.1.6" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -23,6 +22,7 @@ WeightInitializersCUDAExt = "CUDA" Aqua = "0.8" CUDA = "5" ChainRulesCore = "1.21" +LinearAlgebra = "1.9" PartialFunctions = "1.2" PrecompileTools = "1.2" Random = "1.9" diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 6de1f27..45b91df 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -2,7 +2,8 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA using Random -import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, + orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -20,9 +21,8 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end - function sparse_init(rng::AbstractCuRNG, ::Type{T}, 
dims::Integer...; - sparsity::Number, std::Number=T(0.01)) where {T <: Number} + sparsity::Number, std::Number=T(0.01)) where {T <: Number} if length(dims) != 2 throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end @@ -36,7 +36,6 @@ function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1) end - function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} if length(dims) == 1 diff --git a/src/initializers.jl b/src/initializers.jl index 2f771cb..a35e6da 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -144,9 +144,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; - gain::Number=T(1.0)) where {T <: Number} - - @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + gain::Number=T(1.0)) where {T <: Number} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" if length(dims) == 2 rows, cols = dims @@ -222,7 +221,7 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; num_zeros = ceil(Integer, prop_zero * rows) sparse_array = randn(rng, T, dims...) .* std sparse_array[1:num_zeros, :] .= zero(T) - return mapslices(shuffle, sparse_array, dims=1) + return mapslices(shuffle, sparse_array; dims=1) end """ diff --git a/test/Project.toml b/test/Project.toml deleted file mode 100644 index 0adcca7..0000000 --- a/test/Project.toml +++ /dev/null @@ -1,11 +0,0 @@ -[deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[compat] -julia = "1.6" diff --git a/test/runtests.jl b/test/runtests.jl index 4cc13c3..a2afe08 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ using Aqua -using WeightInitializers, Test, SafeTestsets, Statistics +using WeightInitializers, Test, Statistics using StableRNGs, Random, CUDA, LinearAlgebra CUDA.allowscalar(false) @@ -34,7 +34,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, - truncated_normal, identity_init, + truncated_normal, identity_init ] # Sizes @test size(init(3)) == (3,) @@ -79,7 +79,8 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal, identity_init], T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, identity_init], + T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -165,14 +166,16 @@ const GROUP = get(ENV, "GROUP", "All") # In the other case, the transpose should be taken to compute the QR decomposition. 
for (rows, cols) in [(5, 3), (3, 5)] v = orthogonal(rng, rows, cols) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) end for mat in [(3, 4, 5), (2, 2, 5)] v = orthogonal(rng, mat...) cols = mat[end] rows = div(prod(mat), cols) v = reshape(v, (rows, cols)) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) end # Type @testset "Orthogonal Types $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @@ -211,15 +214,15 @@ const GROUP = get(ENV, "GROUP", "All") @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) @test_throws ArgumentError sparse_init(3, sparsity=0.1) - v = sparse_init(100, 100, sparsity=-0.1) + v = sparse_init(100, 100; sparsity=-0.1) @test sum(v .== 0) == 0 - v = sparse_init(100, 100, sparsity=1.1) + v = sparse_init(100, 100; sparsity=1.1) @test sum(v .== 0) == length(v) for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] expected_zeros = ceil(Integer, n_in * sparsity) - v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) - @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) + v = sparse_init(n_in, n_out; sparsity=sparsity, std=σ) + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ end @@ -247,7 +250,7 @@ const GROUP = get(ENV, "GROUP", "All") @test eltype(cl(rng, 4, 2)) == Float32 end end - + @testset "identity_init" begin @testset "Non-identity sizes" begin @test identity_init(2, 3)[:, end] == zeros(Float32, 2) @@ -266,4 +269,5 @@ const GROUP = get(ENV, "GROUP", "All") @testset "Aqua: Quality Assurance" begin Aqua.test_all(WeightInitializers; ambiguities=false) Aqua.test_ambiguities(WeightInitializers; recursive=false) + end end From e9d29ef6277221970413e09e1b2a2b8219b899f6 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 25 Feb 2024 17:28:48 +0100 Subject: [PATCH 09/12] tidying up docstrings --- src/initializers.jl | 75 ++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index a35e6da..5a076ed 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -123,12 +123,17 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( end """ - orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain = 1) where {T <: Real} -> AbstractArray{T, length(dims)} - orthogonal(rng::AbstractRNG; kw...) -> Function + orthogonal([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...; + gain = 1) -> AbstractArray{T, length(dims)} -Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a (semi) orthogonal matrix, as described in [^Saxe14] +Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a +(semi) orthogonal matrix, as described in [^Saxe14] -The function constructs an orthogonal or semi-orthogonal matrix depending on the specified dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to the original dimensions. +The function constructs an orthogonal or semi-orthogonal matrix depending on the specified +dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. 
+For more than two dimensions, it computes an orthogonal matrix of +size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to +the original dimensions. Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. @@ -141,7 +146,9 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. # References -[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 +[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of + learning in deep linear neural networks", + ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @@ -170,10 +177,16 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end """ - sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=0.01) where {T <: Number} -> AbstractArray{T} + sparse_init([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...; + sparsity::Number, std::Number=0.01) -> AbstractArray{T} -Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, using random numbers drawn from a normal distribution for the non-zero elements. This method is introduced in [^Martens2010]. -Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. For example, a sparsity of 0.3 means that approximately 30% of the elements will be set to zero. The non-zero elements are distributed according to a normal distribution, scaled by the std parameter. +Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, +using random numbers drawn from a normal distribution for the non-zero elements. +This method is introduced in [^Martens2010]. +Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. +For example, a sparsity of 0.3 means that approximately 30% of the elements will be +set to zero. The non-zero elements are distributed according to a normal distribution, +scaled by the std parameter. # Arguments @@ -181,11 +194,13 @@ Note: The sparsity parameter controls the proportion of the matrix that will be - `T::Type{<:Number}`: The numeric type of the elements in the returned array. - `dims::Integer...`: The dimensions of the weight matrix to be generated. - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1. - - `std::Number=0.01`: The standard deviation of the normal distribution before applying `gain`. + - `std::Number=0.01`: The standard deviation of the normal distribution + before applying `gain`. # Returns - - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type `T`. + - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` + and type `T`. # Examples @@ -208,7 +223,9 @@ matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) # References -[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. +[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" + _Proceedings of the 27th International Conference on International Conference + on Machine Learning_. 2010. 
""" function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=T(0.01)) where {T <: Number} @@ -225,33 +242,47 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; end """ - identity_init(rng::AbstractRNG, ::Type{T}, size...; gain::Number=1, shift::Union{Integer, Tuple{Integer, Integer}}=0) where {T <: Number} -> AbstractArray{T} + identity_init([::AbstractRNG=_default_rng()], [T=Float32], size...; gain::Number=1, + shift::Union{Integer, Tuple{Integer, Integer}}=0) -> AbstractArray{T} -Constructs an array that aims to provide an identity mapping when used as parameters in most layers of a neural network. The identity mapping is scaled by the `gain` parameter. +Constructs an array that aims to provide an identity mapping when used as parameters in +most layers of a neural network. The identity mapping is scaled by the `gain` parameter. # Behavior - - 1D: Returns a `Vector` of zeros (useful for biases in layers where `input_size == output_size`). - - 2D: Returns an identity matrix (useful for fully connected layers with equal input and output sizes). - - More than 2D: Returns a tensor where the central slice along the last two dimensions is an identity matrix, and the rest are zeros (useful for convolutional layers, simulating an identity convolution). + - 1D: Returns a `Vector` of zeros (useful for biases in layers where + `input_size == output_size`). + - 2D: Returns an identity matrix + (useful for fully connected layers with equal input and output sizes). + - More than 2D: Returns a tensor where the central slice along the last + two dimensions is an identity matrix, and the rest are zeros + (useful for convolutional layers, simulating an identity convolution). # Caveats - - Not all layers will result in an identity mapping when using this initializer. Exceptions include recurrent and normalization layers. - - Layers must have `input_size == output_size` for a perfect identity mapping. In cases where this condition is not met, the function pads extra dimensions with zeros. - - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, and appropriate padding must be applied to ensure the output feature maps are the same size as the input feature maps. + - Not all layers will result in an identity mapping when using this initializer. + Exceptions include recurrent and normalization layers. + - Layers must have `input_size == output_size` for a perfect identity mapping. + In cases where this condition is not met, the function pads extra dimensions with zeros. + - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, + and appropriate padding must be applied to ensure the output + feature maps are the same size as the input feature maps. # Arguments - - `rng::AbstractRNG`: An optional random number generator, included for consistency with other initializers but ignored since the output is deterministic. + - `rng::AbstractRNG`: An optional random number generator, + included for consistency with other initializers but ignored since the + output is deterministic. - `T::Type{<:Number}`: The numeric type of the array elements. - `size...`: The dimensions of the array to be initialized. - `gain::Number=1`: A scaling factor applied to the identity mapping. - - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or a tuple specifying the circular shift applied to the output array. 
+ - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or + a tuple specifying the circular shift applied to the output array. # Returns - - `AbstractArray{T}`: An array initialized to represent an identity mapping, scaled by `gain` and optionally shifted by `shift`. + - `AbstractArray{T}`: An array initialized to represent an identity mapping, + scaled by `gain` and optionally shifted by `shift`. # Examples From 214256fd731d8c3568c8e779f168579f269f8801 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 25 Feb 2024 17:34:03 +0100 Subject: [PATCH 10/12] format --- src/initializers.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index 5a076ed..357b41c 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -147,8 +147,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. # References [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of - learning in deep linear neural networks", - ICLR 2014, https://arxiv.org/abs/1312.6120 +learning in deep linear neural networks", +ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @@ -224,8 +224,8 @@ matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) # References [^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" - _Proceedings of the 27th International Conference on International Conference - on Machine Learning_. 2010. +_Proceedings of the 27th International Conference on International Conference +on Machine Learning_. 2010. """ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=T(0.01)) where {T <: Number} From 1946a556e21e6c0f2fa96fa3245691353126006f Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 26 Feb 2024 11:27:11 +0100 Subject: [PATCH 11/12] import fixes, adding inits to non-diffs list --- src/WeightInitializers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index b2db3cb..ad739bb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -1,10 +1,9 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations -using PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra @recompile_invalidations begin - using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics + using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra end include("utils.jl") @@ -15,7 +14,8 @@ for f in [ :zeros64, :ones64, :rand64, :randn64, :zeros32, :ones32, :rand32, :randn32, :zeros16, :ones16, :rand16, :randn16, :zerosC64, :onesC64, :randC64, :randnC64, :zerosC32, :onesC32, :randC32, :randnC32, :zerosC16, :onesC16, :randC16, :randnC16, :glorot_normal, - :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal] + :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal, :orthogonal, + :sparse_init, :identity_init] @eval @non_differentiable $(f)(::Any...) 
end From 2af0c1871e6b51795db2a8437f56f115b3a57ab9 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 26 Feb 2024 11:29:31 +0100 Subject: [PATCH 12/12] format --- src/WeightInitializers.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index ad739bb..26b05eb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -3,7 +3,8 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations @recompile_invalidations begin - using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra + using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, + LinearAlgebra end include("utils.jl")
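
For quick reference, a minimal usage sketch of the `orthogonal` initializer this series adds (patch 01, reworked in patches 02, 04, and 05). It assumes the final method signatures in `src/initializers.jl` above; the `Xoshiro` seed is only illustrative, and any `AbstractRNG` works:

```julia
using WeightInitializers, Random, LinearAlgebra

rng = Xoshiro(42)  # illustrative seed; any AbstractRNG works

# rows >= cols: the columns are orthonormal, so W' * W ≈ I.
W = orthogonal(rng, Float32, 5, 3)
@assert W' * W ≈ I(3)

# rows < cols: the transpose is built internally, so W * W' ≈ I.
V = orthogonal(rng, Float32, 3, 5)
@assert V * V' ≈ I(3)

# More than two dims: drawn as a prod(dims[1:end-1]) by dims[end]
# semi-orthogonal matrix, then reshaped back to dims.
Wc = orthogonal(rng, Float32, 3, 4, 5)
@assert size(Wc) == (3, 4, 5)
```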
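
A matching sketch for `sparse_init` (added in patch 03, simplified to the `mapslices(shuffle, ...)` form in patch 05): each column gets exactly `ceil(Int, rows * sparsity)` zeros, and the non-zero entries are normal draws scaled by `std`. The concrete sizes and bounds below are illustrative:

```julia
using WeightInitializers, Random, Statistics

rng = Xoshiro(42)  # illustrative seed
S = sparse_init(rng, Float32, 100, 100; sparsity=0.25, std=0.1f0)

# ceil(Int, 100 * 0.25) = 25 zeros per column, in shuffled positions.
@assert all(count(iszero, c) == 25 for c in eachcol(S))

# The non-zero entries keep roughly the requested standard deviation.
@assert 0.05 < std(S[S .!= 0]) < 0.15

# Only 2-D shapes are supported: sparse_init(rng, Float32, 3, 4, 5; ...)
# throws an ArgumentError, matching the tests added in patch 05.
```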
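
And one for `identity_init`, plus the partial-application interface every initializer in the package exposes (the closure forms exercised in `test/runtests.jl`); again a sketch against the final state of the series, not part of the patches themselves:

```julia
using WeightInitializers, Random, LinearAlgebra

rng = Xoshiro(42)  # accepted for API consistency; identity_init ignores it

# 2-D: a gain-scaled identity matrix.
Id = identity_init(rng, Float32, 4, 4; gain=1.5)
@assert Id == Float32(1.5) * Matrix{Float32}(I, 4, 4)

# >2-D: gain sits on the central slice of the kernel dims, zeros elsewhere,
# giving an identity convolution for odd kernel sizes.
K = identity_init(rng, Float32, 3, 3, 8, 8)
@assert K[2, 2, 1, 1] == 1.0f0 && iszero(K[1, 1, 1, 1])

# Closure forms: fix the rng (and optionally the eltype), pass dims later.
init = orthogonal(rng, Float32)
W = init(3, 5)
@assert W isa Matrix{Float32} && size(W) == (3, 5)
```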