From b888479d528865c0baf38fc2295b3072b582a584 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Thu, 18 Jan 2024 14:11:37 +0100 Subject: [PATCH 01/12] rebase adding orthogonal --- Project.toml | 2 ++ src/WeightInitializers.jl | 2 ++ src/initializers.jl | 36 +++++++++++++++++++++++++++++++++++- test/runtests.jl | 33 +++++++++++++++++++++++++++------ 4 files changed, 66 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index a71f74f..06d33e8 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,8 @@ version = "0.1.5" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index 446fa8f..869b5b6 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -1,6 +1,7 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations +using PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra @recompile_invalidations begin using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics @@ -25,5 +26,6 @@ export zerosC64, onesC64, randC64, randnC64, zerosC32, onesC32, randC32, randnC3 export glorot_normal, glorot_uniform export kaiming_normal, kaiming_uniform export truncated_normal +export orthogonal end diff --git a/src/initializers.jl b/src/initializers.jl index ec9900d..7e10893 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -122,9 +122,43 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( return xs end +""" + orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain = 1) where {T <: Real} -> AbstractArray{T, length(dims)} + orthogonal(rng::AbstractRNG; kw...) -> Function + +Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a (semi) orthogonal matrix, as described in [^Saxe14] + +The function constructs an orthogonal or semi-orthogonal matrix depending on the specified dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to the original dimensions. + +Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. + +# Arguments + + - `rng::AbstractRNG`: Random number generator. + - `T::Type{<:Real}`: The type of the elements in the array. + - `dims::Integer...`: The dimensions of the array. + - `gain::Number`: Scaling factor for the elements of the orthogonal matrix. + +# References + +[^Saxe14] Saxe, McClelland, Ganguli. 
"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 +""" +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=1) where {T <: Real} + @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" + rows, cols = dims + if rows < cols + return permutedims(orthogonal(rng, T, cols, rows; gain)) + end + mat = randn(rng, T, rows, cols) + Q, R = LinearAlgebra.qr(mat) + mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) + return mat +end + # Default Fallbacks for all functions for initializer in (:glorot_uniform, :glorot_normal, :kaiming_uniform, :kaiming_normal, - :truncated_normal) + :truncated_normal, :orthogonal) NType = ifelse(initializer === :truncated_normal, Real, Number) @eval function ($initializer)(dims::Integer...; kwargs...) return $initializer(_default_rng(), Float32, dims...; kwargs...) diff --git a/test/runtests.jl b/test/runtests.jl index 4b4c595..061a809 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -32,7 +32,8 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, - kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, truncated_normal + kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, + truncated_normal, orthogonal, ] # Sizes @test size(init(3)) == (3,) @@ -77,8 +78,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal], - T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, orthogonal], T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -98,11 +98,16 @@ const GROUP = get(ENV, "GROUP", "All") end @testset "Closure: $init" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal] + glorot_uniform, glorot_normal, truncated_normal, orthogonal] cl = init(;) # Sizes - @test size(cl(3)) == (3,) - @test size(cl(rng, 3)) == (3,) + if init == orthogonal + @test_throws AssertionError cl(3) + @test_throws AssertionError cl(rng, 3) + else + @test size(cl(3)) == (3,) + @test size(cl(rng, 3)) == (3,) + end @test size(cl(3, 4)) == (3, 4) @test size(cl(rng, 3, 4)) == (3, 4) @test size(cl(3, 4, 5)) == (3, 4, 5) @@ -141,6 +146,22 @@ const GROUP = get(ENV, "GROUP", "All") end @test eltype(init(3, 4; gain=1.5)) == Float32 end + + @testset "orthogonal" begin + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rows, cols) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 + end end @testset "Warning: truncated_normal" begin From 867f09a84e58a89ffe182d174e0815dd13c656c8 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sat, 20 Jan 2024 18:21:55 +0100 Subject: [PATCH 02/12] fixing orthogonal --- src/initializers.jl | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index 7e10893..4c9f13c 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,17 +143,29 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; - gain::Number=1) where {T <: Real} +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1)) where {T <: Real} @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" - rows, cols = dims + + if length(dims) == 2 + rows, cols = dims + else + rows = prod(dims[1:end-1]) + cols = dims[end] + end + if rows < cols return permutedims(orthogonal(rng, T, cols, rows; gain)) end + mat = randn(rng, T, rows, cols) Q, R = LinearAlgebra.qr(mat) mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) - return mat + + if length(dims) > 2 + return reshape(mat, dims) + else + return mat + end end # Default Fallbacks for all functions From b56a09d3082b878ecaed93eb72f896f846c6a83f Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Thu, 1 Feb 2024 21:31:03 +0100 Subject: [PATCH 03/12] rebase added identity_init, sparse_init --- ext/WeightInitializersCUDAExt.jl | 61 ++++++++++++- src/WeightInitializers.jl | 2 + src/initializers.jl | 149 +++++++++++++++++++++++++++++-- test/Project.toml | 11 +++ test/runtests.jl | 22 ++--- 5 files changed, 225 insertions(+), 20 deletions(-) create mode 100644 test/Project.toml diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 4d6e365..eb04364 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,7 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA -import WeightInitializers: __partial_apply, NUM_TO_FPOINT +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,4 +19,63 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end +function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + gain::Number=1, shift::Integer=0) where {T <: Number} + if length(dims) == 1 + # Bias initialization + return CUDA.zeros(T, dims...) + elseif length(dims) == 2 + # Matrix multiplication + rows, cols = dims + mat = CUDA.zeros(T, rows, cols) + diag_indices = 1:min(rows, cols) + CUDA.fill!(view(mat, diag_indices, diag_indices), gain) + return CUDA.circshift(mat, shift) + else + # Convolution or more dimensions + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = CUDA.zeros(T, dims...) + #we should really find a better way to do this + CUDA.@allowscalar for i in 1:min(nin, nout) + index = (centers..., i, i) + weights[index...] 
= gain + end + return CUDA.circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) + end +end + +function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= CUDA.zero(T) + + for col in 1:cols + sparse_array[:, col] = CUDA.shuffle(rng, sparse_array[:, col]) + end + + return sparse_array +end + +for initializer in (:sparse_init, :identity_init) + @eval function ($initializer)(rng::AbstractCuRNG, dims::Integer...; kwargs...) + return $initializer(rng, Float32, dims...; kwargs...) + end + + @eval function ($initializer)(rng::AbstractCuRNG; kwargs...) + return __partial_apply($initializer, (rng, (; kwargs...))) + end + @eval function ($initializer)(rng::AbstractCuRNG, + ::Type{T}; kwargs...) where {T <: Number} + return __partial_apply($initializer, ((rng, T), (; kwargs...))) + end +end + end diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index 869b5b6..b2db3cb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -27,5 +27,7 @@ export glorot_normal, glorot_uniform export kaiming_normal, kaiming_uniform export truncated_normal export orthogonal +export sparse_init +export identity_init end diff --git a/src/initializers.jl b/src/initializers.jl index 4c9f13c..3e1f99a 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,20 +143,23 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1)) where {T <: Real} - @assert length(dims) > 1 "Creating vectors (length(dims) == 1) is not allowed" - +function orthogonal(rng::AbstractRNG, + ::Type{T}, + dims::Integer...; + gain::Number=T(1)) where {T <: Real} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + if length(dims) == 2 rows, cols = dims else - rows = prod(dims[1:end-1]) + rows = prod(dims[1:(end - 1)]) cols = dims[end] end if rows < cols return permutedims(orthogonal(rng, T, cols, rows; gain)) end - + mat = randn(rng, T, rows, cols) Q, R = LinearAlgebra.qr(mat) mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) @@ -168,9 +171,143 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number= end end +""" + sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=0.01) where {T <: Number} -> AbstractArray{T} + +Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, using random numbers drawn from a normal distribution for the non-zero elements. This method is introduced in [^Martens2010]. +Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. For example, a sparsity of 0.3 means that approximately 30% of the elements will be set to zero. The non-zero elements are distributed according to a normal distribution, scaled by the std parameter. + +# Arguments + + - `rng::AbstractRNG`: The random number generator to use. 
+ - `T::Type{<:Number}`: The numeric type of the elements in the returned array. + - `dims::Integer...`: The dimensions of the weight matrix to be generated. + - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1. + - `std::Number=0.01`: The standard deviation of the normal distribution before applying `gain`. + +# Returns + + - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type `T`. + +# Examples + +```julia +using Random + +# Initialize a 5x5 sparsely initialized matrix with 30% sparsity +rng = MersenneTwister(123) +matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) +``` + +``` +5×5 Matrix{Float64}: + 0.0 0.00273815 0.00592403 0.0 0.0 + 0.00459416 -0.000754831 -0.00888936 -0.0077507 0.0 + 0.0 -0.00194229 0.0 0.0 -0.00468489 + 0.0114265 0.0 0.0 -0.00734886 0.00277726 + -0.00396679 0.0 0.00327215 -0.0071741 -0.00880897 +``` + +# References + +[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. +""" +function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= zero(T) + + for col in 1:cols + sparse_array[:, col] = shuffle(rng, sparse_array[:, col]) + end + + return sparse_array +end + +""" + identity_init(rng::AbstractRNG, ::Type{T}, size...; gain::Number=1, shift::Union{Integer, Tuple{Integer, Integer}}=0) where {T <: Number} -> AbstractArray{T} + +Constructs an array that aims to provide an identity mapping when used as parameters in most layers of a neural network. The identity mapping is scaled by the `gain` parameter. + +# Behavior + + - 1D: Returns a `Vector` of zeros (useful for biases in layers where `input_size == output_size`). + - 2D: Returns an identity matrix (useful for fully connected layers with equal input and output sizes). + - More than 2D: Returns a tensor where the central slice along the last two dimensions is an identity matrix, and the rest are zeros (useful for convolutional layers, simulating an identity convolution). + +# Caveats + + - Not all layers will result in an identity mapping when using this initializer. Exceptions include recurrent and normalization layers. + - Layers must have `input_size == output_size` for a perfect identity mapping. In cases where this condition is not met, the function pads extra dimensions with zeros. + - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, and appropriate padding must be applied to ensure the output feature maps are the same size as the input feature maps. + +# Arguments + + - `rng::AbstractRNG`: An optional random number generator, included for consistency with other initializers but ignored since the output is deterministic. + - `T::Type{<:Number}`: The numeric type of the array elements. + - `size...`: The dimensions of the array to be initialized. + - `gain::Number=1`: A scaling factor applied to the identity mapping. + - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or a tuple specifying the circular shift applied to the output array. 
+ +# Returns + + - `AbstractArray{T}`: An array initialized to represent an identity mapping, scaled by `gain` and optionally shifted by `shift`. + +# Examples + +```julia +using Random + +# Identity matrix for fully connected layer +identity_matrix = identity_init(MersenneTwister(123), Float32, 5, 5) + +# Identity tensor for convolutional layer +identity_tensor = identity_init(MersenneTwister(123), + Float32, # Bias initialization + 3, + 3, + 5, # Matrix multiplication + 5; + gain=1.5, + shift=(1, 0)) +``` +""" +function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=1, shift::Integer=0) where {T <: Number} + if length(dims) == 1 + # Bias initialization + return zeros(T, dims...) + elseif length(dims) == 2 + # Matrix multiplication + rows, cols = dims + mat = zeros(T, rows, cols) + for i in 1:min(rows, cols) + mat[i, i] = gain + end + return circshift(mat, shift) + else + # Convolution or more dimensions + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = zeros(T, dims...) + for i in 1:min(nin, nout) + index = (centers..., i, i) + weights[index...] = gain + end + return circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift)) + end +end + # Default Fallbacks for all functions for initializer in (:glorot_uniform, :glorot_normal, :kaiming_uniform, :kaiming_normal, - :truncated_normal, :orthogonal) + :truncated_normal, :orthogonal, :sparse_init, :identity_init) NType = ifelse(initializer === :truncated_normal, Real, Number) @eval function ($initializer)(dims::Integer...; kwargs...) return $initializer(_default_rng(), Float32, dims...; kwargs...) diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..0adcca7 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,11 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +julia = "1.6" diff --git a/test/runtests.jl b/test/runtests.jl index 061a809..647e458 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ -using Aqua, WeightInitializers, Test, Statistics -using StableRNGs, Random, CUDA +using Aqua +using WeightInitializers, Test, SafeTestsets, Statistics +using StableRNGs, Random, CUDA, LinearAlgebra CUDA.allowscalar(false) @@ -33,7 +34,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, - truncated_normal, orthogonal, + truncated_normal, identity_init, ] # Sizes @test size(init(3)) == (3,) @@ -78,7 +79,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal, orthogonal], T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, identity_init], T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -98,16 +99,11 @@ const GROUP = get(ENV, "GROUP", "All") end @testset "Closure: $init" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, 
glorot_normal, truncated_normal, orthogonal] + glorot_uniform, glorot_normal, truncated_normal, identity_init] cl = init(;) # Sizes - if init == orthogonal - @test_throws AssertionError cl(3) - @test_throws AssertionError cl(rng, 3) - else - @test size(cl(3)) == (3,) - @test size(cl(rng, 3)) == (3,) - end + @test size(cl(3)) == (3,) + @test size(cl(rng, 3)) == (3,) @test size(cl(3, 4)) == (3, 4) @test size(cl(rng, 3, 4)) == (3, 4) @test size(cl(3, 4, 5)) == (3, 4, 5) @@ -146,7 +142,7 @@ const GROUP = get(ENV, "GROUP", "All") end @test eltype(init(3, 4; gain=1.5)) == Float32 end - + @testset "orthogonal" begin # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. for (rows, cols) in [(5, 3), (3, 5)] From ee6fe0037b511f9f8f00d93bade43ce28056131e Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 12 Feb 2024 18:40:53 +0100 Subject: [PATCH 04/12] rebase test structure for orthogonal, small fixes --- ext/WeightInitializersCUDAExt.jl | 29 ++++++++++++++++++++- src/initializers.jl | 9 +++---- test/runtests.jl | 43 +++++++++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index eb04364..1137d1f 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,7 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA -import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,6 +19,33 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end +function orthogonal(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + gain::Number=T(1.0)) where {T <: Number} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + + if length(dims) == 2 + rows, cols = dims + else + rows = prod(dims[1:(end - 1)]) + cols = dims[end] + end + + if rows < cols + return CUDA.permutedims(orthogonal(rng, T, cols, rows; gain)) + end + + mat = randn(rng, T, rows, cols) + Q, R = CUDA.qr(mat) + mat .= Q * sign.(CUDA.diag(R)) .* T(gain) + + if length(dims) > 2 + return CUDA.reshape(mat, dims) + else + return mat + end +end + + function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} if length(dims) == 1 diff --git a/src/initializers.jl b/src/initializers.jl index 3e1f99a..c8141ff 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -143,11 +143,10 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. 
"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ -function orthogonal(rng::AbstractRNG, - ::Type{T}, - dims::Integer...; - gain::Number=T(1)) where {T <: Real} - @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" +function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; + gain::Number=T(1.0)) where {T <: Number} + + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" if length(dims) == 2 rows, cols = dims diff --git a/test/runtests.jl b/test/runtests.jl index 647e458..c13ac51 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -160,9 +160,46 @@ const GROUP = get(ENV, "GROUP", "All") end end - @testset "Warning: truncated_normal" begin - @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ - the distribution of values may be inaccurate." truncated_normal(2; mean=-5.0f0) + @testset "Orthogonal rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. + # In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rng, rows, cols) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(rng, mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + # Type + @testset "Orthogonal Types $T" for T in (Float16, Float32, Float64) + @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T + @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T + end + @testset "Orthogonal AbstractArray Type $T" for T in (Float16, Float32, Float64) + @test orthogonal(T, 3, 5) isa AbstractArray{T, 2} + @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = orthogonal(rng, T) + @test cl(3, 5) isa arrtype{T, 2} + end + @testset "Orthogonal Closure" begin + cl = orthogonal(;) + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + @test size(cl(3, 4, 5)) == (3, 4, 5) + @test size(cl(rng, 3, 4, 5)) == (3, 4, 5) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end end @testset "Aqua: Quality Assurance" begin From 215cd5e981a6a954a5ae22daaebdd3bd2a517ca8 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Tue, 20 Feb 2024 17:33:46 +0100 Subject: [PATCH 05/12] small fixes and finalizing tests --- ext/WeightInitializersCUDAExt.jl | 50 ++++++----------------- src/initializers.jl | 11 ++---- test/runtests.jl | 68 ++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 50 deletions(-) diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 1137d1f..6de1f27 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -1,6 +1,7 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA +using Random import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -19,30 +20,20 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end -function orthogonal(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; - gain::Number=T(1.0)) where {T <: Number} - @assert 
length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" - if length(dims) == 2 - rows, cols = dims - else - rows = prod(dims[1:(end - 1)]) - cols = dims[end] - end - - if rows < cols - return CUDA.permutedims(orthogonal(rng, T, cols, rows; gain)) +function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; + sparsity::Number, std::Number=T(0.01)) where {T <: Number} + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end - mat = randn(rng, T, rows, cols) - Q, R = CUDA.qr(mat) - mat .= Q * sign.(CUDA.diag(R)) .* T(gain) + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, T, dims...) .* std + sparse_array[1:num_zeros, :] .= CUDA.zero(T) - if length(dims) > 2 - return CUDA.reshape(mat, dims) - else - return mat - end + return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1) end @@ -72,25 +63,6 @@ function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; end end -function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; - sparsity::Number, std::Number=T(0.01)) where {T <: Number} - if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) - end - - rows, cols = dims - prop_zero = min(1.0, sparsity) - num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, T, dims...) .* std - sparse_array[1:num_zeros, :] .= CUDA.zero(T) - - for col in 1:cols - sparse_array[:, col] = CUDA.shuffle(rng, sparse_array[:, col]) - end - - return sparse_array -end - for initializer in (:sparse_init, :identity_init) @eval function ($initializer)(rng::AbstractCuRNG, dims::Integer...; kwargs...) return $initializer(rng, Float32, dims...; kwargs...) diff --git a/src/initializers.jl b/src/initializers.jl index c8141ff..2f771cb 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -160,8 +160,8 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end mat = randn(rng, T, rows, cols) - Q, R = LinearAlgebra.qr(mat) - mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* T(gain) + Q, R = qr(mat) + mat .= Q * sign.(Diagonal(R)) .* T(gain) if length(dims) > 2 return reshape(mat, dims) @@ -222,12 +222,7 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; num_zeros = ceil(Integer, prop_zero * rows) sparse_array = randn(rng, T, dims...) .* std sparse_array[1:num_zeros, :] .= zero(T) - - for col in 1:cols - sparse_array[:, col] = shuffle(rng, sparse_array[:, col]) - end - - return sparse_array + return mapslices(shuffle, sparse_array, dims=1) end """ diff --git a/test/runtests.jl b/test/runtests.jl index c13ac51..ee797c2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -175,11 +175,11 @@ const GROUP = get(ENV, "GROUP", "All") CUDA.@allowscalar rows < cols ? 
(@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) end # Type - @testset "Orthogonal Types $T" for T in (Float16, Float32, Float64) + @testset "Orthogonal Types $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @test eltype(orthogonal(rng, T, 3, 4; gain=1.5)) == T @test eltype(orthogonal(rng, T, 3, 4, 5; gain=1.5)) == T end - @testset "Orthogonal AbstractArray Type $T" for T in (Float16, Float32, Float64) + @testset "Orthogonal AbstractArray Type $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @test orthogonal(T, 3, 5) isa AbstractArray{T, 2} @test orthogonal(rng, T, 3, 5) isa arrtype{T, 2} @@ -202,8 +202,70 @@ const GROUP = get(ENV, "GROUP", "All") end end + @testset "sparse_init rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes + # sparse_init should yield an error for non 2-d dimensions + # sparse_init should yield no zero elements if sparsity < 0 + # sparse_init should yield all zero elements if sparsity > 1 + # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values + # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter + + @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) + @test_throws ArgumentError sparse_init(3, sparsity=0.1) + v = sparse_init(100, 100, sparsity=-0.1) + @test sum(v .== 0) == 0 + v = sparse_init(100, 100, sparsity=1.1) + @test sum(v .== 0) == length(v) + + for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] + expected_zeros = ceil(Integer, n_in * sparsity) + v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) + @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + end + + # Type + @testset "sparse_init Types $T" for T in (Float16, Float32, Float64) + @test eltype(sparse_init(rng, T, 3, 4; sparsity=0.5)) == T + end + @testset "sparse_init AbstractArray Type $T" for T in (Float16, Float32, Float64) + @test sparse_init(T, 3, 5; sparsity=0.5) isa AbstractArray{T, 2} + @test sparse_init(rng, T, 3, 5; sparsity=0.5) isa arrtype{T, 2} + + cl = sparse_init(rng; sparsity=0.5) + @test cl(T, 3, 5) isa arrtype{T, 2} + + cl = sparse_init(rng, T; sparsity=0.5) + @test cl(3, 5) isa arrtype{T, 2} + end + @testset "sparse_init Closure" begin + cl = sparse_init(; sparsity=0.5) + # Sizes + @test size(cl(3, 4)) == (3, 4) + @test size(cl(rng, 3, 4)) == (3, 4) + # Type + @test eltype(cl(4, 2)) == Float32 + @test eltype(cl(rng, 4, 2)) == Float32 + end + end + + @testset "identity_init" begin + @testset "Non-identity sizes" begin + @test identity_init(2, 3)[:, end] == zeros(Float32, 2) + @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) + @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) + @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) + @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) + end + end + + @static if VERSION ≥ v"1.9" + @testset "Warning: truncated_normal" begin + @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." 
truncated_normal(2; + mean=-5.0f0) + end + end + @testset "Aqua: Quality Assurance" begin Aqua.test_all(WeightInitializers; ambiguities=false) Aqua.test_ambiguities(WeightInitializers; recursive=false) - end end From 24485b5a468386f7e66cb3c962a8dada7265a974 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:26:18 +0100 Subject: [PATCH 06/12] small fix --- test/runtests.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ee797c2..4cc13c3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -258,11 +258,9 @@ const GROUP = get(ENV, "GROUP", "All") end end - @static if VERSION ≥ v"1.9" - @testset "Warning: truncated_normal" begin - @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." truncated_normal(2; - mean=-5.0f0) - end + @testset "Warning: truncated_normal" begin + @test_warn "Mean is more than 2 std outside the limits in truncated_normal, so \ + the distribution of values may be inaccurate." truncated_normal(2; mean=-5.0f0) end @testset "Aqua: Quality Assurance" begin From a199793f9014081b158bab5729ac5ae624c516b0 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:29:07 +0100 Subject: [PATCH 07/12] up version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 06d33e8..444f032 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "WeightInitializers" uuid = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d" authors = ["Avik Pal and contributors"] -version = "0.1.5" +version = "0.1.6" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" From 5cdf8c4b976762d10220b3e2adb46985a5e0e8dd Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Fri, 23 Feb 2024 21:44:17 +0100 Subject: [PATCH 08/12] final fixes --- Project.toml | 2 +- ext/WeightInitializersCUDAExt.jl | 7 +++---- src/initializers.jl | 7 +++---- test/Project.toml | 11 ----------- test/runtests.jl | 24 ++++++++++++++---------- 5 files changed, 21 insertions(+), 30 deletions(-) delete mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 444f032..97d73c1 100644 --- a/Project.toml +++ b/Project.toml @@ -6,7 +6,6 @@ version = "0.1.6" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -23,6 +22,7 @@ WeightInitializersCUDAExt = "CUDA" Aqua = "0.8" CUDA = "5" ChainRulesCore = "1.21" +LinearAlgebra = "1.9" PartialFunctions = "1.2" PrecompileTools = "1.2" Random = "1.9" diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl index 6de1f27..45b91df 100644 --- a/ext/WeightInitializersCUDAExt.jl +++ b/ext/WeightInitializersCUDAExt.jl @@ -2,7 +2,8 @@ module WeightInitializersCUDAExt using WeightInitializers, CUDA using Random -import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, orthogonal +import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init, + orthogonal const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG} @@ -20,9 +21,8 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros) end end - function sparse_init(rng::AbstractCuRNG, ::Type{T}, 
dims::Integer...; - sparsity::Number, std::Number=T(0.01)) where {T <: Number} + sparsity::Number, std::Number=T(0.01)) where {T <: Number} if length(dims) != 2 throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end @@ -36,7 +36,6 @@ function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1) end - function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...; gain::Number=1, shift::Integer=0) where {T <: Number} if length(dims) == 1 diff --git a/src/initializers.jl b/src/initializers.jl index 2f771cb..a35e6da 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -144,9 +144,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; - gain::Number=T(1.0)) where {T <: Number} - - @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" + gain::Number=T(1.0)) where {T <: Number} + @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed" if length(dims) == 2 rows, cols = dims @@ -222,7 +221,7 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; num_zeros = ceil(Integer, prop_zero * rows) sparse_array = randn(rng, T, dims...) .* std sparse_array[1:num_zeros, :] .= zero(T) - return mapslices(shuffle, sparse_array, dims=1) + return mapslices(shuffle, sparse_array; dims=1) end """ diff --git a/test/Project.toml b/test/Project.toml deleted file mode 100644 index 0adcca7..0000000 --- a/test/Project.toml +++ /dev/null @@ -1,11 +0,0 @@ -[deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" -StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[compat] -julia = "1.6" diff --git a/test/runtests.jl b/test/runtests.jl index 4cc13c3..a2afe08 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ using Aqua -using WeightInitializers, Test, SafeTestsets, Statistics +using WeightInitializers, Test, Statistics using StableRNGs, Random, CUDA, LinearAlgebra CUDA.allowscalar(false) @@ -34,7 +34,7 @@ const GROUP = get(ENV, "GROUP", "All") @testset "rng = $(typeof(rng)) & arrtype = $arrtype" for (rng, arrtype) in rngs_arrtypes @testset "Sizes and Types: $init" for init in [zeros32, ones32, rand32, randn32, kaiming_uniform, kaiming_normal, glorot_uniform, glorot_normal, - truncated_normal, identity_init, + truncated_normal, identity_init ] # Sizes @test size(init(3)) == (3,) @@ -79,7 +79,8 @@ const GROUP = get(ENV, "GROUP", "All") @testset "AbstractArray Type: $init $T" for init in [kaiming_uniform, kaiming_normal, - glorot_uniform, glorot_normal, truncated_normal, identity_init], T in (Float16, Float32, + glorot_uniform, glorot_normal, truncated_normal, identity_init], + T in (Float16, Float32, Float64, ComplexF16, ComplexF32, ComplexF64) init === truncated_normal && !(T <: Real) && continue @@ -165,14 +166,16 @@ const GROUP = get(ENV, "GROUP", "All") # In the other case, the transpose should be taken to compute the QR decomposition. 
for (rows, cols) in [(5, 3), (3, 5)] v = orthogonal(rng, rows, cols) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) end for mat in [(3, 4, 5), (2, 2, 5)] v = orthogonal(rng, mat...) cols = mat[end] rows = div(prod(mat), cols) v = reshape(v, (rows, cols)) - CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + CUDA.@allowscalar rows < cols ? (@test v * v' ≈ I(rows)) : + (@test v' * v ≈ I(cols)) end # Type @testset "Orthogonal Types $T" for T in (Float32, Float64)#(Float16, Float32, Float64) @@ -211,15 +214,15 @@ const GROUP = get(ENV, "GROUP", "All") @test_throws ArgumentError sparse_init(3, 4, 5, sparsity=0.1) @test_throws ArgumentError sparse_init(3, sparsity=0.1) - v = sparse_init(100, 100, sparsity=-0.1) + v = sparse_init(100, 100; sparsity=-0.1) @test sum(v .== 0) == 0 - v = sparse_init(100, 100, sparsity=1.1) + v = sparse_init(100, 100; sparsity=1.1) @test sum(v .== 0) == length(v) for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] expected_zeros = ceil(Integer, n_in * sparsity) - v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) - @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) + v = sparse_init(n_in, n_out; sparsity=sparsity, std=σ) + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ end @@ -247,7 +250,7 @@ const GROUP = get(ENV, "GROUP", "All") @test eltype(cl(rng, 4, 2)) == Float32 end end - + @testset "identity_init" begin @testset "Non-identity sizes" begin @test identity_init(2, 3)[:, end] == zeros(Float32, 2) @@ -266,4 +269,5 @@ const GROUP = get(ENV, "GROUP", "All") @testset "Aqua: Quality Assurance" begin Aqua.test_all(WeightInitializers; ambiguities=false) Aqua.test_ambiguities(WeightInitializers; recursive=false) + end end From e9d29ef6277221970413e09e1b2a2b8219b899f6 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 25 Feb 2024 17:28:48 +0100 Subject: [PATCH 09/12] tidying up docstrings --- src/initializers.jl | 75 ++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index a35e6da..5a076ed 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -123,12 +123,17 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T( end """ - orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain = 1) where {T <: Real} -> AbstractArray{T, length(dims)} - orthogonal(rng::AbstractRNG; kw...) -> Function + orthogonal([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...; + gain = 1) -> AbstractArray{T, length(dims)} -Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a (semi) orthogonal matrix, as described in [^Saxe14] +Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a +(semi) orthogonal matrix, as described in [^Saxe14] -The function constructs an orthogonal or semi-orthogonal matrix depending on the specified dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. For more than two dimensions, it computes an orthogonal matrix of size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to the original dimensions. +The function constructs an orthogonal or semi-orthogonal matrix depending on the specified +dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`. 
+For more than two dimensions, it computes an orthogonal matrix of +size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to +the original dimensions. Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. @@ -141,7 +146,9 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. # References -[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 +[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of + learning in deep linear neural networks", + ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @@ -170,10 +177,16 @@ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; end """ - sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=0.01) where {T <: Number} -> AbstractArray{T} + sparse_init([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...; + sparsity::Number, std::Number=0.01) -> AbstractArray{T} -Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, using random numbers drawn from a normal distribution for the non-zero elements. This method is introduced in [^Martens2010]. -Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. For example, a sparsity of 0.3 means that approximately 30% of the elements will be set to zero. The non-zero elements are distributed according to a normal distribution, scaled by the std parameter. +Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements, +using random numbers drawn from a normal distribution for the non-zero elements. +This method is introduced in [^Martens2010]. +Note: The sparsity parameter controls the proportion of the matrix that will be zeroed. +For example, a sparsity of 0.3 means that approximately 30% of the elements will be +set to zero. The non-zero elements are distributed according to a normal distribution, +scaled by the std parameter. # Arguments @@ -181,11 +194,13 @@ Note: The sparsity parameter controls the proportion of the matrix that will be - `T::Type{<:Number}`: The numeric type of the elements in the returned array. - `dims::Integer...`: The dimensions of the weight matrix to be generated. - `sparsity::Number`: The proportion of elements to be zeroed. Must be between 0 and 1. - - `std::Number=0.01`: The standard deviation of the normal distribution before applying `gain`. + - `std::Number=0.01`: The standard deviation of the normal distribution + before applying `gain`. # Returns - - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` and type `T`. + - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims` + and type `T`. # Examples @@ -208,7 +223,9 @@ matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) # References -[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. +[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" + _Proceedings of the 27th International Conference on International Conference + on Machine Learning_. 2010. 
""" function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=T(0.01)) where {T <: Number} @@ -225,33 +242,47 @@ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; end """ - identity_init(rng::AbstractRNG, ::Type{T}, size...; gain::Number=1, shift::Union{Integer, Tuple{Integer, Integer}}=0) where {T <: Number} -> AbstractArray{T} + identity_init([::AbstractRNG=_default_rng()], [T=Float32], size...; gain::Number=1, + shift::Union{Integer, Tuple{Integer, Integer}}=0) -> AbstractArray{T} -Constructs an array that aims to provide an identity mapping when used as parameters in most layers of a neural network. The identity mapping is scaled by the `gain` parameter. +Constructs an array that aims to provide an identity mapping when used as parameters in +most layers of a neural network. The identity mapping is scaled by the `gain` parameter. # Behavior - - 1D: Returns a `Vector` of zeros (useful for biases in layers where `input_size == output_size`). - - 2D: Returns an identity matrix (useful for fully connected layers with equal input and output sizes). - - More than 2D: Returns a tensor where the central slice along the last two dimensions is an identity matrix, and the rest are zeros (useful for convolutional layers, simulating an identity convolution). + - 1D: Returns a `Vector` of zeros (useful for biases in layers where + `input_size == output_size`). + - 2D: Returns an identity matrix + (useful for fully connected layers with equal input and output sizes). + - More than 2D: Returns a tensor where the central slice along the last + two dimensions is an identity matrix, and the rest are zeros + (useful for convolutional layers, simulating an identity convolution). # Caveats - - Not all layers will result in an identity mapping when using this initializer. Exceptions include recurrent and normalization layers. - - Layers must have `input_size == output_size` for a perfect identity mapping. In cases where this condition is not met, the function pads extra dimensions with zeros. - - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, and appropriate padding must be applied to ensure the output feature maps are the same size as the input feature maps. + - Not all layers will result in an identity mapping when using this initializer. + Exceptions include recurrent and normalization layers. + - Layers must have `input_size == output_size` for a perfect identity mapping. + In cases where this condition is not met, the function pads extra dimensions with zeros. + - For convolutional layers to achieve an identity mapping, kernel sizes must be odd, + and appropriate padding must be applied to ensure the output + feature maps are the same size as the input feature maps. # Arguments - - `rng::AbstractRNG`: An optional random number generator, included for consistency with other initializers but ignored since the output is deterministic. + - `rng::AbstractRNG`: An optional random number generator, + included for consistency with other initializers but ignored since the + output is deterministic. - `T::Type{<:Number}`: The numeric type of the array elements. - `size...`: The dimensions of the array to be initialized. - `gain::Number=1`: A scaling factor applied to the identity mapping. - - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or a tuple specifying the circular shift applied to the output array. 
+ - `shift::Union{Integer, Tuple{Integer, Integer}}=0`: An integer or + a tuple specifying the circular shift applied to the output array. # Returns - - `AbstractArray{T}`: An array initialized to represent an identity mapping, scaled by `gain` and optionally shifted by `shift`. + - `AbstractArray{T}`: An array initialized to represent an identity mapping, + scaled by `gain` and optionally shifted by `shift`. # Examples From 214256fd731d8c3568c8e779f168579f269f8801 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Sun, 25 Feb 2024 17:34:03 +0100 Subject: [PATCH 10/12] format --- src/initializers.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/initializers.jl b/src/initializers.jl index 5a076ed..357b41c 100644 --- a/src/initializers.jl +++ b/src/initializers.jl @@ -147,8 +147,8 @@ Cannot construct a vector, i.e., `length(dims) == 1` is forbidden. # References [^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of - learning in deep linear neural networks", - ICLR 2014, https://arxiv.org/abs/1312.6120 +learning in deep linear neural networks", +ICLR 2014, https://arxiv.org/abs/1312.6120 """ function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...; gain::Number=T(1.0)) where {T <: Number} @@ -224,8 +224,8 @@ matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01) # References [^Martens2010] Martens, J, "Deep learning via Hessian-free optimization" - _Proceedings of the 27th International Conference on International Conference - on Machine Learning_. 2010. +_Proceedings of the 27th International Conference on International Conference +on Machine Learning_. 2010. """ function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; sparsity::Number, std::Number=T(0.01)) where {T <: Number} From 1946a556e21e6c0f2fa96fa3245691353126006f Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 26 Feb 2024 11:27:11 +0100 Subject: [PATCH 11/12] import fixes, adding inits to non-diffs list --- src/WeightInitializers.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index b2db3cb..ad739bb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -1,10 +1,9 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations -using PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra @recompile_invalidations begin - using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics + using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra end include("utils.jl") @@ -15,7 +14,8 @@ for f in [ :zeros64, :ones64, :rand64, :randn64, :zeros32, :ones32, :rand32, :randn32, :zeros16, :ones16, :rand16, :randn16, :zerosC64, :onesC64, :randC64, :randnC64, :zerosC32, :onesC32, :randC32, :randnC32, :zerosC16, :onesC16, :randC16, :randnC16, :glorot_normal, - :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal] + :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal, :orthogonal, + :sparse_init, :identity_init] @eval @non_differentiable $(f)(::Any...) 
end From 2af0c1871e6b51795db2a8437f56f115b3a57ab9 Mon Sep 17 00:00:00 2001 From: MartinuzziFrancesco Date: Mon, 26 Feb 2024 11:29:31 +0100 Subject: [PATCH 12/12] format --- src/WeightInitializers.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl index ad739bb..26b05eb 100644 --- a/src/WeightInitializers.jl +++ b/src/WeightInitializers.jl @@ -3,7 +3,8 @@ module WeightInitializers import PrecompileTools: @recompile_invalidations @recompile_invalidations begin - using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, LinearAlgebra + using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics, + LinearAlgebra end include("utils.jl")
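
For quick reference, a minimal usage sketch of the `orthogonal` initializer this series adds (patch 01, reworked in patches 02, 04, and 05). It assumes the final method signatures in `src/initializers.jl` above; the `Xoshiro` seed is only illustrative, and any `AbstractRNG` works:

```julia
using WeightInitializers, Random, LinearAlgebra

rng = Xoshiro(42)  # illustrative seed; any AbstractRNG works

# rows >= cols: the columns are orthonormal, so W' * W ≈ I.
W = orthogonal(rng, Float32, 5, 3)
@assert W' * W ≈ I(3)

# rows < cols: the transpose is built internally, so W * W' ≈ I.
V = orthogonal(rng, Float32, 3, 5)
@assert V * V' ≈ I(3)

# More than two dims: drawn as a prod(dims[1:end-1]) by dims[end]
# semi-orthogonal matrix, then reshaped back to dims.
Wc = orthogonal(rng, Float32, 3, 4, 5)
@assert size(Wc) == (3, 4, 5)
```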
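
A matching sketch for `sparse_init` (added in patch 03, simplified to the `mapslices(shuffle, ...)` form in patch 05): each column gets exactly `ceil(Int, rows * sparsity)` zeros, and the non-zero entries are normal draws scaled by `std`. The concrete sizes and bounds below are illustrative:

```julia
using WeightInitializers, Random, Statistics

rng = Xoshiro(42)  # illustrative seed
S = sparse_init(rng, Float32, 100, 100; sparsity=0.25, std=0.1f0)

# ceil(Int, 100 * 0.25) = 25 zeros per column, in shuffled positions.
@assert all(count(iszero, c) == 25 for c in eachcol(S))

# The non-zero entries keep roughly the requested standard deviation.
@assert 0.05 < std(S[S .!= 0]) < 0.15

# Only 2-D shapes are supported: sparse_init(rng, Float32, 3, 4, 5; ...)
# throws an ArgumentError, matching the tests added in patch 05.
```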
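
And one for `identity_init`, plus the partial-application interface every initializer in the package exposes (the closure forms exercised in `test/runtests.jl`); again a sketch against the final state of the series, not part of the patches themselves:

```julia
using WeightInitializers, Random, LinearAlgebra

rng = Xoshiro(42)  # accepted for API consistency; identity_init ignores it

# 2-D: a gain-scaled identity matrix.
Id = identity_init(rng, Float32, 4, 4; gain=1.5)
@assert Id == Float32(1.5) * Matrix{Float32}(I, 4, 4)

# >2-D: gain sits on the central slice of the kernel dims, zeros elsewhere,
# giving an identity convolution for odd kernel sizes.
K = identity_init(rng, Float32, 3, 3, 8, 8)
@assert K[2, 2, 1, 1] == 1.0f0 && iszero(K[1, 1, 1, 1])

# Closure forms: fix the rng (and optionally the eltype), pass dims later.
init = orthogonal(rng, Float32)
W = init(3, 5)
@assert W isa Matrix{Float32} && size(W) == (3, 5)
```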