diff --git a/Project.toml b/Project.toml
index 93c5d95..64d1f21 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "NeuralAttentionlib"
 uuid = "12afc1b8-fad6-47e1-9132-84abc478905f"
 authors = ["chengchingwen "]
-version = "0.2.12"
+version = "0.2.13"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -11,21 +11,19 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 
 [compat]
-Adapt = "3.3"
-CUDA = "3, 4"
+Adapt = "4"
+CUDA = "5"
 ChainRulesCore = "1.3"
-GPUArrays = "8"
+GPUArrays = "8, 9, 10"
 GPUArraysCore = "0.1"
-NNlib = "0.7, 0.8"
-NNlibCUDA = "0.2"
+NNlib = "0.7, 0.8, 0.9"
 Requires = "1.1"
 Static = "0.7, 0.8"
-julia = "1.6"
+julia = "1.8"
 
 [extras]
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
diff --git a/src/NeuralAttentionlib.jl b/src/NeuralAttentionlib.jl
index 8d40af9..8827d94 100644
--- a/src/NeuralAttentionlib.jl
+++ b/src/NeuralAttentionlib.jl
@@ -9,7 +9,6 @@ import GPUArraysCore
 
 using ChainRulesCore
 using NNlib
-using NNlibCUDA
 
 using Requires
 
diff --git a/src/functional/layernorm.jl b/src/functional/layernorm.jl
index 312a74d..75550d3 100644
--- a/src/functional/layernorm.jl
+++ b/src/functional/layernorm.jl
@@ -19,16 +19,18 @@ _x_y2(x, y) = (x, x * y)
 function _normalize(inN::T, ϵ::T, x::T, mean_M2_::Tuple{Int32, T, T}) where T
     _, μ, M2 = mean_M2_
     v = M2 * inN
-    σ₀ = sqrt(v)
-    σ = max(σ₀, ϵ)
+    # σ₀ = sqrt(v .+ ϵ)
+    # σ = max(σ₀, ϵ)
+    σ = sqrt(v .+ ϵ)
     return (x - μ) / σ
 end
 
 function _normalize(inN::Float16, ϵ::Float16, x::Float16, mean_M2_::Tuple{Int32, Float32, Float32})
     _, μ, M2 = mean_M2_
     v = Float16(M2) * inN
-    σ₀ = sqrt(v)
-    σ = max(σ₀, ϵ)
+    # σ₀ = sqrt(v)
+    # σ = max(σ₀, ϵ)
+    σ = sqrt(v .+ ϵ)
     return Float16(Float32(x) - μ) / σ
 end
 
diff --git a/src/mask/mask.jl b/src/mask/mask.jl
index afc2371..d32c989 100644
--- a/src/mask/mask.jl
+++ b/src/mask/mask.jl
@@ -138,7 +138,7 @@ Base.@propagate_inbounds Base.getindex(m::M, I::Integer...) where {M <: Union{<:
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, i::CartesianIndex) = m[Tuple(i)]
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, I::Tuple) = m[I...]
 
-Adapt.adapt(to::CUDA.Adaptor, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))
 
 randomness(::AbstractMask) = static(false)
 require_dest(::AbstractMask) = static(false)
diff --git a/src/mask/wrapper.jl b/src/mask/wrapper.jl
index 0b0e3aa..f04ae33 100644
--- a/src/mask/wrapper.jl
+++ b/src/mask/wrapper.jl
@@ -9,7 +9,7 @@ AttenMask(m::FlipMask) = FlipMask(AttenMask(m.mask))
 Base.:!(m::AbstractMask) = FlipMask(m)
 Base.:!(m::FlipMask) = m.mask
 
-Adapt.adapt(to::CUDA.Adaptor, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))
 adapt_structure(to, x::FlipMask) = FlipMask(adapt(to, x.mask))
 GetIndexer(m::FlipMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size),), dest_size)
 
@@ -43,7 +43,7 @@ Base.:|(::Nothing, m::AbstractMask) = nothing
 Base.:&(m::AbstractMask, ::Nothing) = m
 Base.:&(::Nothing, m::AbstractMask) = m
 
-Adapt.adapt(to::CUDA.Adaptor, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),
+Adapt.adapt(to::CUDA.KernelAdaptor, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),
                                                                      masks = map(Base.Fix1(adapt, to), m.masks)))
 adapt_structure(to, x::CombinedMask) = CombinedMask(x.f, adapt(to, x.masks))
 GetIndexer(m::CombinedMask, dest_size = nothing) = Indexer{typeof(m)}((m.f, masks = map(Base.Fix2(GetIndexer, dest_size), m.masks)))
 
@@ -101,7 +101,7 @@ function BatchedMask(mask)
     return BatchedMask(mask, batch_dim)
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))
 adapt_structure(to, x::BatchedMask) = BatchedMask(adapt(to, x.mask), x.batch_dim)
 GetIndexer(m::BatchedMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), batch_dim = static(m.batch_dim)))
 
@@ -138,7 +138,7 @@ end
 
 AttenMask(r::RepeatMask) = RepeatMask(AttenMask(r.mask), r.num)
 
-Adapt.adapt(to::CUDA.Adaptor, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))
 adapt_structure(to, x::RepeatMask) = RepeatMask(adapt(to, x.mask), x.num)
 GetIndexer(m::RepeatMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), num = m.num))
 
@@ -176,7 +176,7 @@ struct BiSequenceMask{QM<:AbstractMask, KM<:AbstractMask} <: AbstractWrapperMask
     k_mask::KM
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))
+Adapt.adapt(to::CUDA.KernelAdaptor, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))
 adapt_structure(to, x::BiSequenceMask) = BiSequenceMask(adapt(to, x.q_mask), adapt(to, x.k_mask))
 
 bi_dest_size(::Nothing, is_q) = nothing
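Note on the dependency changes: with NNlib 0.9 the GPU kernels that previously lived in the separate NNlibCUDA package are shipped as NNlib package extensions that activate when CUDA.jl is loaded, which is why the NNlibCUDA entries leave [deps]/[compat] and `using NNlibCUDA` is dropped from src/NeuralAttentionlib.jl. A minimal sketch of the resulting loading pattern (illustrative only; requires a CUDA-capable GPU and NNlib >= 0.9):

    using NNlib, CUDA                     # no `using NNlibCUDA` needed anymore
    x = CUDA.rand(Float32, 4, 4)          # array on the GPU
    y = NNlib.softmax(x; dims = 1)        # dispatches to NNlib's CUDA extension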
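Note on the layernorm change: `_normalize` now folds the epsilon into the variance before taking the square root (σ = sqrt(v + ϵ)) instead of clamping the standard deviation with max(sqrt(v), ϵ), i.e. it uses the common x̂ = (x - μ) / sqrt(v + ϵ) formulation. A toy comparison of the two forms (illustrative only; the values are made up and this is not the kernel code itself):

    x = Float32[1, 2, 3, 4]
    ϵ = 1f-5
    μ = sum(x) / length(x)
    v = sum(abs2, x .- μ) / length(x)     # biased variance, like M2 * inN in the kernel
    σ_old = max(sqrt(v), ϵ)               # previous behavior: clamp the std at ϵ
    σ_new = sqrt(v + ϵ)                   # new behavior: add ϵ under the square root
    (x .- μ) ./ σ_old, (x .- μ) ./ σ_new  # nearly identical here; they differ when v ≈ 0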
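Note on the CUDA.Adaptor to CUDA.KernelAdaptor renames: newer CUDA.jl releases renamed the adaptor type used to convert arguments at kernel launch, so every mask method that hooks into that conversion must target the new name. These `Adapt.adapt` methods are what turn a host-side mask into a GPU-indexable Indexer. A rough usage sketch (assumes a CUDA-capable GPU; `LengthMask`/`BatchedMask` come from NeuralAttentionlib.Masks, adjust names if your version differs):

    using CUDA, Adapt, NeuralAttentionlib
    using NeuralAttentionlib.Masks

    m  = BatchedMask(LengthMask(cu([3, 5])))  # array-backed mask living on the GPU
    md = CUDA.cudaconvert(m)                  # goes through Adapt.adapt(::CUDA.KernelAdaptor, ...)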