diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1d4bc2929..c528a7402 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -83,6 +83,25 @@ steps:
         JULIA_AMDGPU_HIP_MUST_LOAD: "1"
         JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
 
+  - label: "Julia 1.10 Enzyme"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+      - JuliaCI/julia-test#v1:
+          test_args: "enzyme"
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    command: "julia --project -e 'using Pkg; Pkg.update()'"
+    timeout_in_minutes: 180
+    env:
+      JULIA_NUM_THREADS: 4
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+
   - label: "GPU-less environment"
     plugins:
       - JuliaCI/julia#v1:
diff --git a/Project.toml b/Project.toml
index fe3367731..7e8e04b86 100644
--- a/Project.toml
+++ b/Project.toml
@@ -32,11 +32,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
+[weakdeps]
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+
+[extensions]
+EnzymeCoreExt = "EnzymeCore"
+
 [compat]
 AbstractFFTs = "1.0"
 Adapt = "4"
 Atomix = "0.1"
 CEnum = "0.4, 0.5"
+EnzymeCore = "0.7.3"
 ExprTools = "0.1"
 GPUArrays = "10"
 GPUCompiler = "0.27"
diff --git a/a.jl b/a.jl
new file mode 100644
index 000000000..966515560
--- /dev/null
+++ b/a.jl
@@ -0,0 +1,33 @@
+using AMDGPU
+using KernelAbstractions
+
+function compute_tensors(tensor, kernel_fun, Nx, Ny, Nz)
+    kernel! = kernel_fun(get_backend(tensor))
+    kernel!(tensor, Nx, Ny, Nz; ndrange=size(tensor))
+    KernelAbstractions.synchronize(get_backend(tensor))
+    return
+end
+
+@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
+    idx = @index(Global)
+    res = zero(eltype(tensor))
+    for p in (-Nx):Nx
+        for q in Ny:(Ny + 2)
+            res += 2.0
+        end
+    end
+    @inbounds tensor[idx] = res
+end
+
+function main()
+    nx, ny, nz = 10, 1, 1
+    Nx, Ny, Nz = 1, 1, 1
+    # tensor = zeros(Float64, nx, ny, nz)
+    # compute_tensors(tensor, kernel_xx!, Nx, Ny, Nz)
+    # println("cpu:", tensor)
+
+    tensor = AMDGPU.zeros(Float64, nx, ny, nz)
+    compute_tensors(tensor, kernel_xx!, Nx, Ny, Nz)
+    println("amd:", tensor)
+end
+main()
diff --git a/ext/EnzymeCoreExt/EnzymeCoreExt.jl b/ext/EnzymeCoreExt/EnzymeCoreExt.jl
new file mode 100644
index 000000000..46e621738
--- /dev/null
+++ b/ext/EnzymeCoreExt/EnzymeCoreExt.jl
@@ -0,0 +1,216 @@
+module EnzymeCoreExt
+
+using AMDGPU
+using EnzymeCore
+using EnzymeCore: EnzymeRules
+using GPUCompiler
+
+function EnzymeCore.compiler_job_from_backend(
+    ::ROCBackend, @nospecialize(F::Type), @nospecialize(TT::Type),
+)
+    mi = GPUCompiler.methodinstance(F, TT)
+    return GPUCompiler.CompilerJob(mi, AMDGPU.compiler_config(AMDGPU.device()))
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<: Duplicated},
+    f::Const{F}, tt::Const{TT}; kwargs...
+) where {F, TT}
+    res = fn.val(f.val, tt.val; kwargs...)
+    return Duplicated(res, res)
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<: BatchDuplicated{T, N}},
+    f::Const{F}, tt::Const{TT}; kwargs...
+) where {F, TT, T, N}
+    res = fn.val(f.val, tt.val; kwargs...)
+    return BatchDuplicated(res, ntuple(_ -> res, Val(N)))
+end
+
+function EnzymeRules.reverse(
+    config, fn::Const{typeof(AMDGPU.hipfunction)},
+    ::Type{RT}, subtape, f, tt; kwargs...,
+) where RT
+    return (nothing, nothing)
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
+) where {RT, IT}
+    if RT <: Duplicated
+        Duplicated(fn.val(x.val), fn.val(x.dval))
+    elseif RT <: Const
+        fn.val(x.val)::eltype(RT)
+    elseif RT <: DuplicatedNoNeed
+        fn.val(x.val)::eltype(RT)
+    else
+        tup = ntuple(Val(EnzymeCore.batch_size(RT))) do i
+            Base.@_inline_meta
+            fn.val(x.dval[i])::eltype(RT)
+        end
+        if RT <: BatchDuplicated
+            BatchDuplicated(fn.val(x.val), tup)
+        else
+            tup
+        end
+    end
+end
+
+function EnzymeRules.reverse(
+    config, fn::Const{typeof(AMDGPU.rocconvert)},
+    ::Type{RT}, tape, x::IT,
+) where {RT, IT}
+    return (nothing,)
+end
+
+function meta_fn(fn, args::Vararg{Any, N}) where N
+    EnzymeCore.autodiff_deferred(Forward, fn, Const, args...)
+    return
+end
+
+function EnzymeRules.forward(
+    fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F, TT}},
+    ::Type{Const{Nothing}}, args...; kwargs...,
+) where {F, TT}
+    GC.@preserve args begin
+        kernel_args = ((rocconvert(a) for a in args)...,)
+        kernel_tt = Tuple{(F, (typeof(a) for a in kernel_args)...)...}
+        kernel = AMDGPU.hipfunction(meta_fn, kernel_tt)
+        kernel(fn.val.f, args...; kwargs...)
+    end
+    return
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
+) where {RT, IT}
+    primal = EnzymeRules.needs_primal(config) ?
+        fn.val(x.val) : nothing
+    primal_T = EnzymeRules.needs_primal(config) ? eltype(RT) : Nothing
+
+    shadow = if EnzymeRules.needs_shadow(config)
+        if EnzymeRules.width(config) == 1
+            fn.val(x.dval)
+        else
+            ntuple(Val(EnzymeRules.width(config))) do i
+                Base.@_inline_meta
+                fn.val(x.dval[i])
+            end
+        end
+    else
+        nothing
+    end
+    shadow_T = EnzymeRules.needs_shadow(config) ?
+        (EnzymeRules.width(config) == 1 ?
+            eltype(RT) : NTuple{EnzymeRules.width(config), eltype(RT)}) :
+        Nothing
+
+    return EnzymeRules.AugmentedReturn{primal_T, shadow_T, Nothing}(
+        primal, shadow, nothing)
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::Const{typeof(AMDGPU.hipfunction)},
+    ::Type{RT}, f::Const{F},
+    tt::Const{TT}; kwargs...
+) where {F, CT, RT <: EnzymeCore.Annotation{CT}, TT}
+    res = fn.val(f.val, tt.val; kwargs...)
+
+    primal = EnzymeRules.needs_primal(config) ? res : nothing
+    primal_T = EnzymeRules.needs_primal(config) ? CT : Nothing
+
+    shadow = if EnzymeRules.needs_shadow(config)
+        if EnzymeRules.width(config) == 1
+            res
+        else
+            ntuple(Val(EnzymeRules.width(config))) do i
+                Base.@_inline_meta
+                res
+            end
+        end
+    else
+        nothing
+    end
+    shadow_T = EnzymeRules.needs_shadow(config) ?
+        (EnzymeRules.width(config) == 1 ?
+            CT : NTuple{EnzymeRules.width(config), CT}) :
+        Nothing
+
+    return EnzymeRules.AugmentedReturn{primal_T, shadow_T, Nothing}(
+        primal, shadow, nothing)
+end
+
+function meta_augf(
+    f, tape::ROCDeviceArray{TapeType}, ::Val{ModifiedBetween}, args::Vararg{Any, N},
+) where {N, ModifiedBetween, TapeType}
+    forward, _ = EnzymeCore.autodiff_deferred_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        map(typeof, args)...,
+    )
+
+    idx = 0
+    # idx *= gridGroupDim().x
+    idx += workgroupIdx().x - 1
+
+    idx *= gridGroupDim().y
+    idx += workgroupIdx().y - 1
+
+    idx *= gridGroupDim().z
+    idx += workgroupIdx().z - 1
+
+    idx *= workgroupDim().x
+    idx += workitemIdx().x - 1
+
+    idx *= workgroupDim().y
+    idx += workitemIdx().y - 1
+
+    idx *= workgroupDim().z
+    idx += workitemIdx().z - 1
+    idx += 1
+
+    @inbounds tape[idx] = forward(Const(f), args...)[1]
+    return
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F,TT}},
+    ::Type{Const{Nothing}}, args...;
+    groupsize::AMDGPU.Runtime.ROCDim = 1,
+    gridsize::AMDGPU.Runtime.ROCDim = 1, kwargs...,
+) where {F,TT}
+    kernel_args = ((rocconvert(a) for a in args)...,)
+    kernel_tt = map(typeof, kernel_args)
+
+    ModifiedBetween = EnzymeRules.overwritten(config)
+    compiler_job = EnzymeCore.compiler_job_from_backend(
+        ROCBackend(), typeof(Base.identity), Tuple{Float64})
+    TapeType = EnzymeCore.tape_type(
+        compiler_job,
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        Const{F}, Const{Nothing},
+        kernel_tt...,
+    )
+    threads = AMDGPU.Runtime.ROCDim3(groupsize)
+    blocks = AMDGPU.Runtime.ROCDim3(gridsize)
+    subtape = ROCArray{TapeType}(
+        undef, blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z)
+
+    GC.@preserve args subtape begin
+        subtape_cc = rocconvert(subtape)
+        kernel_tt2 = Tuple{(
+            F, typeof(subtape_cc), Val{ModifiedBetween}, kernel_tt...,
+        )...}
+        kernel = AMDGPU.hipfunction(meta_augf, kernel_tt2)
+        kernel(fn.val.f, subtape_cc, Val(ModifiedBetween), args...;
+            groupsize=(threads.x, threads.y, threads.z),
+            gridsize=(blocks.x, blocks.y, blocks.z),
+            kwargs...)
+    end
+    return EnzymeRules.AugmentedReturn{Nothing, Nothing, ROCArray}(nothing, nothing, subtape)
+end
+
+end
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index a923c7308..0dc148030 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -71,8 +71,8 @@ using .ROCmDiscovery
 
 include("utils.jl")
 
-include(joinpath("hsa", "HSA.jl"))
-include(joinpath("hip", "HIP.jl"))
+include("hsa/HSA.jl")
+include("hip/HIP.jl")
 
 using .HIP
 using .HIP: HIPContext, HIPDevice, HIPStream
@@ -107,7 +107,7 @@ export sync_workgroup, sync_workgroup_count, sync_workgroup_and, sync_workgroup_
 
 include("compiler/Compiler.jl")
 import .Compiler
-import .Compiler: hipfunction
+import .Compiler: hipfunction, compiler_config
 
 include("tls.jl")
 include("highlevel.jl")
@@ -126,12 +126,12 @@ include("kernels/reverse.jl")
 
 allowscalar(x::Bool) = GPUArrays.allowscalar(x)
 
-include(joinpath("blas", "rocBLAS.jl"))
-include(joinpath("solver", "rocSOLVER.jl"))
-include(joinpath("sparse", "rocSPARSE.jl"))
-include(joinpath("rand", "rocRAND.jl"))
-include(joinpath("fft", "rocFFT.jl"))
-include(joinpath("dnn", "MIOpen.jl"))
+include("blas/rocBLAS.jl")
+include("solver/rocSOLVER.jl")
+include("sparse/rocSPARSE.jl")
+include("rand/rocRAND.jl")
+include("fft/rocFFT.jl")
+include("dnn/MIOpen.jl")
 
 include("random.jl")
 
diff --git a/t.jl b/t.jl
new file mode 100644
index 000000000..90544aa7f
--- /dev/null
+++ b/t.jl
@@ -0,0 +1,24 @@
+using AMDGPU
+using EnzymeCore, Enzyme
+
+function square_kernel!(x)
+    i = workitemIdx().x
+    x[i] *= x[i]
+    return
+end
+
+function square!(x)
+    @roc groupsize=length(x) gridsize=1 square_kernel!(x)
+    return
+end
+
+function main()
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    Enzyme.autodiff(Reverse, square!, Duplicated(A, dA))
+    @show A
+    @show dA
+    @assert all(dA .≈ (2:2:128))
+    return
+end
+main()
diff --git a/test/enzyme_tests.jl b/test/enzyme_tests.jl
new file mode 100644
index 000000000..da25e71e2
--- /dev/null
+++ b/test/enzyme_tests.jl
@@ -0,0 +1,51 @@
+@testitem "enzyme" begin
+
+using AMDGPU
+using EnzymeCore, Enzyme
+using GPUCompiler
+
+@testset "CompilerJob from backend" begin
+    job = EnzymeCore.compiler_job_from_backend(ROCBackend(), typeof(()->nothing), Tuple{})
+    @test job isa GPUCompiler.CompilerJob
+end
+
+function square_kernel!(x)
+    i = workitemIdx().x
+    x[i] *= x[i]
+    return
+end
+
+function square!(x)
+    @roc groupsize=length(x) gridsize=1 square_kernel!(x)
+    return nothing
+end
+
+# @testset "Forward Kernel" begin
+#     A = ROCArray(collect(1.0:64.0))
+#     dA = ROCArray(ones(Float64, 64))
+#     Enzyme.autodiff(Forward, square!, Duplicated(A, dA))
+#     @test all(dA .≈ (2:2:128))
+
+#     A = ROCArray(collect(1.0:64.0))
+#     dA = ROCArray(ones(Float64, 64))
+#     dA2 = ROCArray(ones(Float64, 64) .* 3.0)
+#     Enzyme.autodiff(Forward, square!, BatchDuplicated(A, (dA, dA2)))
+#     @test all(dA .≈ (2:2:128))
+#     @test all(dA2 .≈ (2:2:128) .* 3)
+# end
+
+@testset "Reverse Kernel" begin
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    Enzyme.autodiff(Reverse, square!, Duplicated(A, dA))
+    @test all(dA .≈ (2:2:128))
+
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    dA2 = ROCArray(ones(Float64, 64) .* 3.0)
+    Enzyme.autodiff(Reverse, square!, BatchDuplicated(A, (dA, dA2)))
+    @test all(dA .≈ (2:2:128))
+    @test all(dA2 .≈ (2:2:128) .* 3)
+end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 18e44e365..5369a77b5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,8 +2,10 @@ using AMDGPU
 using AMDGPU: Device, Runtime, @allowscalar
 import AMDGPU.Device:
     HostCallHolder, hostcall!
+import Pkg
 import PrettyTables
 import InteractiveUtils
+
 using LinearAlgebra
 using ReTestItems
 using Test
@@ -30,7 +32,7 @@ end
 
 AMDGPU.allowscalar(false)
 
-const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions"]
+const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
 
 function parse_flags!(args, flag; default = nothing, typ = typeof(default))
     for f in args
@@ -88,7 +90,14 @@ for test_name in ARGS
     """)
 end
 
-const TARGET_TESTS = isempty(ARGS) ? TEST_NAMES : ARGS
+# Do not run Enzyme tests by default.
+const TARGET_TESTS = isempty(ARGS) ?
+    [t for t in TEST_NAMES if t != "enzyme"] :
+    ARGS
+
+if "enzyme" in TARGET_TESTS
+    Pkg.add(["EnzymeCore", "Enzyme"])
+end
 
 # Run tests in parallel.
 np = set_jobs ? jobs : (Sys.CPU_THREADS ÷ 2)