Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Further ChainRulesCore.rrule Integration #254

Open
wants to merge 44 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
cef527f
Bump patch version
willtebbutt Sep 12, 2024
b9c3f65
Fix usage with benchmarktools
willtebbutt Sep 12, 2024
1f49d85
Merge branch 'main' into wct/actually-improve-rrule-integration
willtebbutt Sep 13, 2024
8f0f75d
Initial pass
willtebbutt Sep 13, 2024
e791cef
Bump patch
willtebbutt Sep 13, 2024
f45456e
Unit test to_tapir_tangent and to_cr_tangent
willtebbutt Sep 13, 2024
bec9f06
Make use of macro
willtebbutt Sep 13, 2024
d037101
More testing and tidying up
willtebbutt Sep 13, 2024
54947f0
Add some basic type checking and a test
willtebbutt Sep 13, 2024
bc88483
Improve formatting and commenting
willtebbutt Sep 13, 2024
f29b8f3
Formatting
willtebbutt Sep 13, 2024
50d7dd8
Improve documentation
willtebbutt Sep 13, 2024
1788c07
Explain how not to use rrule functionality
willtebbutt Sep 13, 2024
b4e80bc
Add rules for BLAS utilities
willtebbutt Sep 13, 2024
4a2b8e0
Initial NNlib integration
willtebbutt Sep 13, 2024
d1d9fae
Thunks and batched_mul
willtebbutt Sep 13, 2024
6f036ad
More rules + kwargs + rename
willtebbutt Sep 13, 2024
e225a0a
Fix link in docs
willtebbutt Sep 13, 2024
3bba38e
Rename chain_rules_macro to chain_rules_interop
willtebbutt Sep 13, 2024
619f0ce
Complete rename of chain rules interop file
willtebbutt Sep 16, 2024
345c46a
Refactor chain rules interop
willtebbutt Sep 16, 2024
8e87d11
Add more nnlib functionality
willtebbutt Sep 16, 2024
d345978
Remove old tests
willtebbutt Sep 16, 2024
0f3fe90
Some work
willtebbutt Sep 16, 2024
ae93a27
Remove errant show statment
willtebbutt Sep 17, 2024
82ecd82
Remove redundant test
willtebbutt Sep 17, 2024
ca93535
Support where
willtebbutt Sep 17, 2024
fc6c00f
Make use of where params
willtebbutt Sep 17, 2024
473bc02
Improve kwarg interface
willtebbutt Sep 17, 2024
1cfbfcc
Default kwargs test
willtebbutt Sep 17, 2024
8ac2903
Improve docstring
willtebbutt Sep 17, 2024
f60ca36
Merge in main
willtebbutt Sep 19, 2024
ce5afd9
Some work
willtebbutt Sep 25, 2024
3539d46
Merge in main
willtebbutt Sep 29, 2024
8a80218
Merge branch 'main' into wct/actually-improve-rrule-integration
willtebbutt Sep 29, 2024
ccdef0b
Merge branch 'main' into wct/actually-improve-rrule-integration
willtebbutt Sep 29, 2024
6edc9a4
Some work
willtebbutt Sep 30, 2024
f66cc9c
Better conv support in nnlib rules
willtebbutt Oct 1, 2024
f865fde
More LuxLib rules
willtebbutt Oct 1, 2024
149e7b4
Permit :meta nodes in IR
willtebbutt Oct 1, 2024
2dcd535
Remove redundant test
willtebbutt Oct 1, 2024
0933f37
Uncomment some tests
willtebbutt Oct 1, 2024
d217102
Rename chain rules doc
willtebbutt Oct 1, 2024
c6f8cf0
Add notes to docs on rule writing strategies
willtebbutt Oct 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
LogDensityProblemsAD = "996a588d-648d-4e1f-a8f0-a84b347e47b1"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
MooncakeCUDAExt = "CUDA"
MooncakeDynamicPPLExt = "DynamicPPL"
MooncakeJETExt = "JET"
MooncakeLogDensityProblemsADExt = "LogDensityProblemsAD"
MooncakeLuxLibExt = "LuxLib"
MooncakeNNlibExt = "NNlib"
MooncakeSpecialFunctionsExt = "SpecialFunctions"

[compat]
Expand All @@ -46,7 +50,9 @@ FillArrays = "1"
Graphs = "1"
JET = "0.9"
LogDensityProblemsAD = "1"
LuxLib = "1.2"
MistyClosures = "1"
NNlib = "0.9"
PDMats = "0.11"
Setfield = "1"
SpecialFunctions = "2"
Expand All @@ -66,11 +72,14 @@ FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
KernelFunctions = "ec8451be-7e33-11e9-00cf-bbf324bd1392"
LogDensityProblemsAD = "996a588d-648d-4e1f-a8f0-a84b347e47b1"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
TemporalGPs = "e155a3c4-0841-43e1-8b83-a0e4f03cc18f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["AbstractGPs", "BenchmarkTools", "CUDA", "DiffTests", "Distributions", "Documenter", "DynamicPPL", "FillArrays", "KernelFunctions", "JET", "LogDensityProblemsAD", "PDMats", "SpecialFunctions", "StableRNGs", "Test", "TemporalGPs"]
test = ["AbstractGPs", "BenchmarkTools", "CUDA", "DiffTests", "Distributions", "Documenter", "DynamicPPL", "FillArrays", "KernelFunctions", "JET", "LogDensityProblemsAD", "Lux", "LuxLib", "NNlib", "PDMats", "SpecialFunctions", "StableRNGs", "Test", "TemporalGPs"]
7 changes: 5 additions & 2 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,12 @@ makedocs(
"Algorithmic Differentiation" => "algorithmic_differentiation.md",
"Mooncake.jl's Rule System" => "mathematical_interpretation.md",
],
"Utilities" => [
"Tools for Rules" => "tools_for_rules.md",
"Debug Mode" => "debug_mode.md",
"Debugging and MWEs" => "debugging_and_mwes.md",
],
"Known Limitations" => "known_limitations.md",
"Debug Mode" => "debug_mode.md",
"Debugging and MWEs" => "debugging_and_mwes.md",
]
)

Expand Down
50 changes: 50 additions & 0 deletions docs/src/tools_for_rules.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Tools for Rules

Most of the time, Mooncake.jl can just differentiate your code, but you will need to intervene if you make use of a language feature which is unsupported.
However, this does not always necessitate writing your own `rrule!!` from scratch.
In this section, we detail some useful strategies which can help you avoid having to write `rrule!!`s in many situations.

## Simplifying Code via Overlays

Suppose you have a function
```julia
foo(x::Float64) = bar(x)
```
where Mooncake.jl fails to differentiate `bar` for some reason.
If you have access to another function `baz`, which does the same thing as `bar`, but does so in a way which Mooncake.jl can differentiate, you can simply write:
```julia
Base.Experimental.@overlay Mooncake.mooncake_method_table foo(x::Float64) = baz(x)
```
When looking up the code for `foo(::Float64)`, Mooncake.jl will see this method, rather than the original, and should successfully differentiate it.
If you search for `@overlay` in the Mooncake.jl source code, you will see a variety of instances where this is used in practice.

This approach is often very straightforward, and we recommend you try this first before going down the path of writing rules.

## Functions with Zero Derivative

If the above strategy does not work, but you find yourself in the surprisingly common situation that the derivative of your function is always zero, you can very straightforwardly write a rule by making use of the following:
```@docs
Mooncake.simple_zero_adjoint
```
Suppose you have a function `foo(x, y, z)` whose derivative is zero. You would write an `rrule!!` for it as follows:
```julia
function Mooncake.rrule!!(f::CoDual{typeof(foo)}, x::CoDual, y::CoDual, z::CoDual)
return Mooncake.simple_zero_adjoint(f, x, y, z)
end
```
Users of ChainRules.jl should be familiar with this functionality -- it is morally the same as `ChainRulesCore.@non_differentiable`.
This approach is utilised often in Mooncake.jl's codebase.

## Using ChainRules.jl

[ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl) provides a large number of rules for differentiating functions in reverse-mode.
These rules are methods of the `ChainRulesCore.rrule` function.
There are some instances where it is most convenient to implement a `Mooncake.rrule!!` by wrapping an existing `ChainRulesCore.rrule`.

There is enough similarity between these two systems that most of the boilerplate code can be avoided.
The docstrings below explain this functionality, and how it should / should not be used.

```@docs
Mooncake.@from_rrule
Mooncake.rrule_wrapper
```
176 changes: 176 additions & 0 deletions ext/MooncakeLuxLibExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
module MooncakeLuxLibExt

using LuxLib, Random, Mooncake
using Base: IEEEFloat
using Base.Experimental: @overlay

import LuxLib: Impl
import LuxLib.Utils: static_training_mode_check
import Mooncake: @from_rrule, DefaultCtx, MooncakeInterpreter, mooncake_method_table, CoDual

# Derive Mooncake `rrule!!`s from the `ChainRulesCore.rrule`s that LuxLib ships
# for its dense / batched matmul kernels. Restricted to `IEEEFloat` element
# types via the `where {P<:IEEEFloat}` constraint on each signature.
@from_rrule(DefaultCtx, Tuple{typeof(Impl.matmul), Array{P}, Array{P}} where {P<:IEEEFloat})
@from_rrule(
DefaultCtx,
Tuple{typeof(Impl.matmuladd), Array{P}, Array{P}, Vector{P}} where {P<:IEEEFloat},
)
@from_rrule(
DefaultCtx,
Tuple{typeof(Impl.batched_matmul), Array{P, 3}, Array{P, 3}} where {P<:IEEEFloat},
)

# Re-implement a bunch of methods to ensure that Mooncake can differentiate them.
@overlay mooncake_method_table function LuxLib.Impl.fused_dense(

Check warning on line 22 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L22

Added line #L22 was not covered by tests
opmode,
act::F,
weight::AbstractMatrix,
x::AbstractMatrix,
b::LuxLib.Optional{<:AbstractVector},
) where {F}
return bias_activation(act, Impl.matmul(weight, x), b)
end

@overlay mooncake_method_table function LuxLib.Impl.bias_activation_loop!(

Check warning on line 32 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L32

Added line #L32 was not covered by tests
y::AbstractArray{yT, 3}, σ::F, x::AbstractArray{xT, 3}, bias::AbstractVector
) where {F, xT, yT}
return LuxLib.Impl.bias_activation_simd_loop!(y, σ, x, bias)

Check warning on line 35 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L35

Added line #L35 was not covered by tests
end

@overlay mooncake_method_table function LuxLib.Impl.activation_loop!(

Check warning on line 38 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L38

Added line #L38 was not covered by tests
y::AbstractArray, σ::F, x::AbstractArray
) where {F}
return LuxLib.Impl.activation_simd_loop!(y, σ, x)
end

@overlay mooncake_method_table function LuxLib.Impl.fused_conv(

Check warning on line 44 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L44

Added line #L44 was not covered by tests
::LuxLib.Impl.AbstractInternalArrayOpMode,
act::F,
weight::AbstractArray{wT, N},
x::AbstractArray{xT, N},
bias::LuxLib.Optional{<:AbstractVector},
cdims::LuxLib.Impl.ConvDims,
) where {F, wT, xT, N}
return LuxLib.Impl.bias_activation(act, LuxLib.Impl.conv(x, weight, cdims), bias)

Check warning on line 52 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L52

Added line #L52 was not covered by tests
end

# Register rules for LuxLib's SLEEF-accelerated activation functions: one rule
# for the scalar call, and one for `Broadcast.broadcasted` applied to the
# activation over a scalar or an array of `IEEEFloat`s.
# NOTE(review): `f` is a loop-local function object used inside `typeof(f)` in
# the macro call without `@eval` -- this relies on `typeof(f)` being evaluated
# per loop iteration in the macro's expansion; confirm `@from_rrule` supports
# this usage.
for f in [
Impl.SLEEFActivations.sigmoid_fast,
Impl.SLEEFActivations.softplus,
Impl.SLEEFActivations.logsigmoid,
Impl.SLEEFActivations.swish,
Impl.SLEEFActivations.lisht,
Impl.SLEEFActivations.tanh,
Impl.SLEEFActivations.tanh_fast,
]
@from_rrule DefaultCtx Tuple{typeof(f), IEEEFloat}
@from_rrule(
DefaultCtx,
Tuple{typeof(Broadcast.broadcasted), typeof(f), Union{IEEEFloat, Array{<:IEEEFloat}}},
)
end

# Mark `static_training_mode_check` as a primitive with an identically-zero
# adjoint. `simple_zero_adjoint` asserts the derivative w.r.t. every argument
# is zero -- morally the same as `ChainRulesCore.@non_differentiable`.
Mooncake.@is_primitive(DefaultCtx, Tuple{typeof(static_training_mode_check), Vararg})
function Mooncake.rrule!!(f::CoDual{typeof(static_training_mode_check)}, x::CoDual...)
return Mooncake.simple_zero_adjoint(f, x...)
end




# This is a really horrible hack that we need to do until Mooncake is able to support the
# call-back-into-ad interface that ChainRules exposes.

import LuxLib.Impl:
safe_eltype,
batchnorm_affine_normalize_internal,
batchnorm_affine_normalize_internal!,
∇batchnorm_affine_normalize,
AbstractInternalArrayOpMode

import ChainRulesCore as CRC

# ChainRules rrule for the `identity`-activation batch-norm kernel. Written as
# a `CRC.rrule` (rather than a native Mooncake rule) so it can be wrapped via
# `@from_rrule` -- part of the workaround noted above for Mooncake not yet
# supporting ChainRules' call-back-into-AD interface.
function CRC.rrule(
::typeof(batchnorm_affine_normalize_internal),
opmode::AbstractInternalArrayOpMode,
::typeof(identity),
x::AbstractArray{T, N},
μ::AbstractVector,
σ²::AbstractVector,
γ::LuxLib.Optional{<:AbstractVector},
β::LuxLib.Optional{<:AbstractVector},
ϵ::Real,
) where {T, N}
# Output buffer, eltype promoted across all inputs (`safe_eltype` handles the
# `nothing` cases of the optional `γ` / `β`).
y = similar(
x,
promote_type(
safe_eltype(x), safe_eltype(μ), safe_eltype(σ²), safe_eltype(γ), safe_eltype(β)
)
)
# Scratch buffer along dimension N-1 (the channel dimension, presumably --
# TODO confirm), reused by the pullback below.
γ′ = similar(
x, promote_type(safe_eltype(γ), safe_eltype(σ²), safe_eltype(ϵ)), size(x, N - 1)
)

batchnorm_affine_normalize_internal!(y, opmode, identity, x, μ, σ², γ, β, ϵ, γ′)

# Projectors restore the natural tangent type of each primal argument; when
# γ / β are absent (`nothing`) there is nothing to project, so use `identity`.
𝒫x, 𝒫μ, 𝒫σ² = CRC.ProjectTo(x), CRC.ProjectTo(μ), CRC.ProjectTo(σ²)
𝒫γ = γ === nothing ? identity : CRC.ProjectTo(γ)
𝒫β = β === nothing ? identity : CRC.ProjectTo(β)

# Pullback: `NoTangent` for the function itself, `opmode`, `identity`, and
# the trailing `ϵ`; projected cotangents for the array arguments.
∇batchnorm_affine_normalize_internal = LuxLib.Impl.@closure Δ -> begin
∂x, ∂μ, ∂σ², ∂γ, ∂β = ∇batchnorm_affine_normalize(opmode, Δ, x, μ, σ², γ, β, ϵ, γ′)
∂∅ = CRC.NoTangent()
return ∂∅, ∂∅, ∂∅, 𝒫x(∂x), 𝒫μ(∂μ), 𝒫σ²(∂σ²), 𝒫γ(∂γ), 𝒫β(∂β), ∂∅
end

return y, ∇batchnorm_affine_normalize_internal
end

# Wrap the `CRC.rrule` defined above as a Mooncake `rrule!!`. The argument
# types here mirror that rrule's signature.
@from_rrule(
DefaultCtx,
Tuple{
typeof(batchnorm_affine_normalize_internal),
AbstractInternalArrayOpMode,
typeof(identity),
AbstractArray,
AbstractVector,
AbstractVector,
LuxLib.Optional{<:AbstractVector},
LuxLib.Optional{<:AbstractVector},
Real,
},
)

@overlay mooncake_method_table function batchnorm_affine_normalize_internal(

Check warning on line 142 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L142

Added line #L142 was not covered by tests
opmode::LuxLib.AbstractInternalArrayOpMode,
act::F,
x::AbstractArray{xT, 3},
μ::AbstractVector,
σ²::AbstractVector,
γ::Union{Nothing, AbstractVector},
β::Union{Nothing, AbstractVector},
ϵ::Real,
) where {F, xT}
y = batchnorm_affine_normalize_internal(opmode, identity, x, μ, σ², γ, β, ϵ)
LuxLib.Impl.activation!(y, opmode, act, y)
return y

Check warning on line 154 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L152-L154

Added lines #L152 - L154 were not covered by tests
end

@overlay mooncake_method_table function batchnorm_affine_normalize_internal(

Check warning on line 157 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L157

Added line #L157 was not covered by tests
opmode::LuxLib.AbstractInternalArrayOpMode,
::typeof(identity),
x::AbstractArray{xT, 3},
μ::AbstractVector,
σ²::AbstractVector,
γ::Union{Nothing, AbstractVector},
β::Union{Nothing, AbstractVector},
ϵ::Real,
) where {xT}
y = similar(x,

Check warning on line 167 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L167

Added line #L167 was not covered by tests
promote_type(
safe_eltype(x), safe_eltype(μ), safe_eltype(σ²), safe_eltype(γ), safe_eltype(β)
)
)
batchnorm_affine_normalize_internal!(y, opmode, identity, x, μ, σ², γ, β, ϵ)
return y

Check warning on line 173 in ext/MooncakeLuxLibExt.jl

View check run for this annotation

Codecov / codecov/patch

ext/MooncakeLuxLibExt.jl#L172-L173

Added lines #L172 - L173 were not covered by tests
end

end
65 changes: 65 additions & 0 deletions ext/MooncakeNNlibExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Mooncake.jl rules for NNlib, derived from the `ChainRulesCore.rrule`s that
# NNlib provides for its kernels, via `Mooncake.@from_rrule`.
module MooncakeNNlibExt

using NNlib, Random, Mooncake
using Base: IEEEFloat
using NNlib: dropout

using NNlib: conv, depthwiseconv
import Mooncake: @from_rrule, DefaultCtx, MinimalCtx

@from_rrule(
    MinimalCtx,
    Tuple{typeof(batched_mul), Array{P, 3}, Array{P, 3}} where {P<:IEEEFloat},
)
# The trailing `true` in the calls below marks signatures with keyword
# arguments (presumably -- confirm against the `@from_rrule` docstring).
@from_rrule(
    MinimalCtx,
    Tuple{typeof(dropout), AbstractRNG, Array{P}, P} where {P<:IEEEFloat},
    true,
)
@from_rrule(MinimalCtx, Tuple{typeof(softmax), Array{<:IEEEFloat}}, true)
@from_rrule(MinimalCtx, Tuple{typeof(logsoftmax), Array{<:IEEEFloat}}, true)
@from_rrule(MinimalCtx, Tuple{typeof(logsumexp), Array{<:IEEEFloat}}, true)
@from_rrule(
    MinimalCtx,
    Tuple{typeof(upsample_nearest), Array{<:IEEEFloat}, NTuple{N, Int} where {N}},
)
@from_rrule(
    MinimalCtx,
    Tuple{
        typeof(NNlib.fold), Array{<:IEEEFloat}, NTuple{N, Int} where {N}, DenseConvDims,
    },
)
@from_rrule(
    MinimalCtx, Tuple{typeof(NNlib.unfold), Array{<:IEEEFloat}, DenseConvDims}
)
@from_rrule(
    MinimalCtx,
    Tuple{typeof(NNlib.scatter), Any, Array, Array{<:Union{Integer, Tuple}}},
    true,
)
# Rules for each convolution flavour and both of its gradient helpers. The
# filter-gradient registration lives INSIDE the loop: previously it sat after
# `end`, where the loop-local `∇conv_filter` symbol is out of scope (falling
# back to NNlib's global `∇conv_filter` binding), and consequently no rule was
# ever registered for `∇depthwiseconv_filter`.
for conv in [:conv, :depthwiseconv]
    local ∇conv_data, ∇conv_filter = Symbol.(:∇, conv, [:_data, :_filter])

    @eval @from_rrule(
        MinimalCtx,
        Tuple{typeof($conv), Array{P}, Array{P}, ConvDims} where {P<:IEEEFloat},
        true,
    )
    @eval @from_rrule(
        MinimalCtx,
        Tuple{typeof($∇conv_data), Array{P}, Array{P}, ConvDims} where {P<:IEEEFloat},
        true,
    )
    @eval @from_rrule(
        MinimalCtx,
        Tuple{typeof($∇conv_filter), Array{P}, Array{P}, ConvDims} where {P<:IEEEFloat},
        true,
    )
end
for pool in [:maxpool, :meanpool]
    @eval @from_rrule(
        MinimalCtx, Tuple{typeof($pool), Array{<:IEEEFloat}, PoolDims}, true
    )
end
@from_rrule(MinimalCtx, Tuple{typeof(pad_constant), Array, Any, Any}, true)
end
3 changes: 2 additions & 1 deletion src/Mooncake.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ using
Random,
Setfield

# There are many clashing names, so we will always qualify uses of names from CRC.
import ChainRulesCore

using Base:
Expand Down Expand Up @@ -85,7 +86,7 @@ include(joinpath("rrules", "misc.jl"))
include(joinpath("rrules", "new.jl"))
include(joinpath("rrules", "tasks.jl"))

include("chain_rules_macro.jl")
include("chain_rules_interop.jl")
include("interface.jl")
include("config.jl")

Expand Down
Loading
Loading