chengchingwen · pevnak · Feb 13, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "NeuralAttentionlib"
 uuid = "12afc1b8-fad6-47e1-9132-84abc478905f"
 authors = ["chengchingwen <adgjl5645@hotmail.com>"]
-version = "0.2.12"
+version = "0.2.13"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -11,21 +11,19 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 
 [compat]
-Adapt = "3.3"
-CUDA = "3, 4"
+Adapt = "4"
+CUDA = "5"
 ChainRulesCore = "1.3"
-GPUArrays = "8"
+GPUArrays = "8, 9, 10"
 GPUArraysCore = "0.1"
-NNlib = "0.7, 0.8"
-NNlibCUDA = "0.2"
+NNlib = "0.7, 0.8, 0.9"
 Requires = "1.1"
 Static = "0.7, 0.8"
-julia = "1.6"
+julia = "1.8"
 
 [extras]
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"

diff --git a/src/NeuralAttentionlib.jl b/src/NeuralAttentionlib.jl
@@ -9,7 +9,6 @@ import GPUArraysCore
 using ChainRulesCore
 
 using NNlib
-using NNlibCUDA
 
 using Requires
 

diff --git a/src/functional/layernorm.jl b/src/functional/layernorm.jl
@@ -19,16 +19,18 @@ _x_y2(x, y) = (x, x * y)
 function _normalize(inN::T, ϵ::T, x::T, mean_M2_::Tuple{Int32, T, T}) where T
     _, μ, M2 = mean_M2_
     v = M2 * inN
-    σ₀ = sqrt(v)
-    σ = max(σ₀, ϵ)
+    # Stabilize the division by adding ϵ to v under the square root,
+    # rather than clamping σ from below with max(sqrt(v), ϵ).
+    σ = sqrt(v + ϵ)
     return (x - μ) / σ
 end
 
 function _normalize(inN::Float16, ϵ::Float16, x::Float16, mean_M2_::Tuple{Int32, Float32, Float32})
     _, μ, M2 = mean_M2_
     v = Float16(M2) * inN
-    σ₀ = sqrt(v)
-    σ = max(σ₀, ϵ)
+    # Stabilize the division by adding ϵ to the Float16 value v under the square root,
+    # rather than clamping σ from below with max(sqrt(v), ϵ).
+    σ = sqrt(v + ϵ)
     return Float16(Float32(x) - μ) / σ
 end
 

diff --git a/src/mask/mask.jl b/src/mask/mask.jl
@@ -138,7 +138,7 @@ Base.@propagate_inbounds Base.getindex(m::M, I::Integer...) where {M <: Union{<:
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, i::CartesianIndex) = m[Tuple(i)]
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, I::Tuple) = m[I...]
 
-Adapt.adapt(to::CUDA.Adaptor, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))  # adapt every field of GetIndexer(m) with `to`; the mask type stays as the Indexer parameter
 
 randomness(::AbstractMask) = static(false)
 require_dest(::AbstractMask) = static(false)
diff --git a/src/mask/wrapper.jl b/src/mask/wrapper.jl
@@ -9,7 +9,7 @@ AttenMask(m::FlipMask) = FlipMask(AttenMask(m.mask))
 Base.:!(m::AbstractMask) = FlipMask(m)
 Base.:!(m::FlipMask) = m.mask
 
-Adapt.adapt(to::CUDA.Adaptor, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))  # Indexer holding the adapted inner mask
 adapt_structure(to, x::FlipMask) = FlipMask(adapt(to, x.mask))
 GetIndexer(m::FlipMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size),), dest_size)
 
@@ -43,7 +43,7 @@ Base.:|(::Nothing, m::AbstractMask) = nothing
 Base.:&(m::AbstractMask, ::Nothing) = m
 Base.:&(::Nothing, m::AbstractMask) = m
 
-Adapt.adapt(to::CUDA.Adaptor, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),
+Adapt.adapt(to::CUDA.KernelAdaptor, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),  # adapt f and every entry of masks with `to`
                                                                      masks = map(Base.Fix1(adapt, to), m.masks)))
 adapt_structure(to, x::CombinedMask) = CombinedMask(x.f, adapt(to, x.masks))
 GetIndexer(m::CombinedMask, dest_size = nothing) = Indexer{typeof(m)}((m.f, masks = map(Base.Fix2(GetIndexer, dest_size), m.masks)))
@@ -101,7 +101,7 @@ function BatchedMask(mask)
     return BatchedMask(mask, batch_dim)
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))  # inner mask is adapted; batch_dim is wrapped with static(...)
 adapt_structure(to, x::BatchedMask) = BatchedMask(adapt(to, x.mask), x.batch_dim)
 GetIndexer(m::BatchedMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), batch_dim = static(m.batch_dim)))
 
@@ -138,7 +138,7 @@ end
 
 AttenMask(r::RepeatMask) = RepeatMask(AttenMask(r.mask), r.num)
 
-Adapt.adapt(to::CUDA.Adaptor, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))  # inner mask is adapted; num is passed through as-is
 adapt_structure(to, x::RepeatMask) = RepeatMask(adapt(to, x.mask), x.num)
 GetIndexer(m::RepeatMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), num = m.num))
 
@@ -176,7 +176,7 @@ struct BiSequenceMask{QM<:AbstractMask, KM<:AbstractMask} <: AbstractWrapperMask
     k_mask::KM
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))  # q_mask and k_mask are each adapted with `to`
 adapt_structure(to, x::BiSequenceMask) = BiSequenceMask(adapt(to, x.q_mask), adapt(to, x.k_mask))
 
 bi_dest_size(::Nothing, is_q) = nothing