diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1d4bc2929..c528a7402 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -83,6 +83,25 @@ steps:
         JULIA_AMDGPU_HIP_MUST_LOAD: "1"
         JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
 
+  - label: "Julia 1.10 Enzyme"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+      - JuliaCI/julia-test#v1:
+          test_args: "enzyme"
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    command: "julia --project -e 'using Pkg; Pkg.update()'"
+    timeout_in_minutes: 180
+    env:
+      JULIA_NUM_THREADS: 4
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+
   - label: "GPU-less environment"
     plugins:
       - JuliaCI/julia#v1:
diff --git a/Project.toml b/Project.toml
index fe3367731..7e8e04b86 100644
--- a/Project.toml
+++ b/Project.toml
@@ -32,11 +32,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
+[weakdeps]
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+
+[extensions]
+EnzymeCoreExt = "EnzymeCore"
+
 [compat]
 AbstractFFTs = "1.0"
 Adapt = "4"
 Atomix = "0.1"
 CEnum = "0.4, 0.5"
+EnzymeCore = "0.7.3"
 ExprTools = "0.1"
 GPUArrays = "10"
 GPUCompiler = "0.27"
diff --git a/a.jl b/a.jl
new file mode 100644
index 000000000..966515560
--- /dev/null
+++ b/a.jl
@@ -0,0 +1,33 @@
+using AMDGPU
+using KernelAbstractions
+
+function compute_tensors(tensor, kernel_fun, Nx, Ny, Nz)
+    kernel! = kernel_fun(get_backend(tensor))
+    kernel!(tensor, Nx, Ny, Nz; ndrange=size(tensor))
+    KernelAbstractions.synchronize(get_backend(tensor))
+    return
+end
+
+@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
+    idx = @index(Global)
+    res = zero(eltype(tensor))
+    for p in (-Nx):Nx
+        for q in Ny:(Ny + 2)
+            res += 2.0
+        end
+    end
+    @inbounds tensor[idx] = res
+end
+
+function main()
+    nx, ny, nz = 10, 1, 1
+    Nx, Ny, Nz = 1, 1, 1
+    # tensor = zeros(Float64, nx, ny, nz)
+    # compute_tensors(tensor, kernel_xx!, Nx, Ny, Nz)
+    # println("cpu:", tensor)
+
+    tensor = AMDGPU.zeros(Float64, nx, ny, nz)
+    compute_tensors(tensor, kernel_xx!, Nx, Ny, Nz)
+    println("amd:", tensor)
+end
+main()
diff --git a/ext/EnzymeCoreExt/EnzymeCoreExt.jl b/ext/EnzymeCoreExt/EnzymeCoreExt.jl
new file mode 100644
index 000000000..46e621738
--- /dev/null
+++ b/ext/EnzymeCoreExt/EnzymeCoreExt.jl
@@ -0,0 +1,216 @@
+module EnzymeCoreExt
+
+using AMDGPU
+using EnzymeCore
+using EnzymeCore: EnzymeRules
+using GPUCompiler
+
+function EnzymeCore.compiler_job_from_backend(
+    ::ROCBackend, @nospecialize(F::Type), @nospecialize(TT::Type),
+)
+    mi = GPUCompiler.methodinstance(F, TT)
+    return GPUCompiler.CompilerJob(mi, AMDGPU.compiler_config(AMDGPU.device()))
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<: Duplicated},
+    f::Const{F}, tt::Const{TT}; kwargs...
+) where {F, TT}
+    res = fn.val(f.val, tt.val; kwargs...)
+    return Duplicated(res, res)
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.hipfunction)}, ::Type{<: BatchDuplicated{T, N}},
+    f::Const{F}, tt::Const{TT}; kwargs...
+) where {F, TT, T, N}
+    res = fn.val(f.val, tt.val; kwargs...)
+    return BatchDuplicated(res, ntuple(_ -> res, Val(N)))
+end
+
+function EnzymeRules.reverse(
+    config, fn::Const{typeof(AMDGPU.hipfunction)},
+    ::Type{RT}, subtape, f, tt; kwargs...,
+) where RT
+    return (nothing, nothing)
+end
+
+function EnzymeRules.forward(
+    fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
+) where {RT, IT}
+    if RT <: Duplicated
+        Duplicated(fn.val(x.val), fn.val(x.dval))
+    elseif RT <: Const
+        fn.val(x.val)::eltype(RT)
+    elseif RT <: DuplicatedNoNeed
+        fn.val(x.val)::eltype(RT)
+    else
+        tup = ntuple(Val(EnzymeCore.batch_size(RT))) do i
+            Base.@_inline_meta
+            fn.val(x.dval[i])::eltype(RT)
+        end
+        if RT <: BatchDuplicated
+            BatchDuplicated(fn.val(x.val), tup)
+        else
+            tup
+        end
+    end
+end
+
+function EnzymeRules.reverse(
+    config, fn::Const{typeof(AMDGPU.rocconvert)},
+    ::Type{RT}, tape, x::IT,
+) where {RT, IT}
+    return (nothing,)
+end
+
+function meta_fn(fn, args::Vararg{Any, N}) where N
+    EnzymeCore.autodiff_deferred(Forward, fn, Const, args...)
+    return
+end
+
+function EnzymeRules.forward(
+    fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F, TT}},
+    ::Type{Const{Nothing}}, args...; kwargs...,
+) where {F, TT}
+    GC.@preserve args begin
+        kernel_args = ((rocconvert(a) for a in args)...,)
+        kernel_tt = Tuple{(F, (typeof(a) for a in kernel_args)...)...}
+        kernel = AMDGPU.hipfunction(meta_fn, kernel_tt)
+        kernel(fn.val.f, args...; kwargs...)
+    end
+    return
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::Const{typeof(AMDGPU.rocconvert)}, ::Type{RT}, x::IT,
+) where {RT, IT}
+    primal = EnzymeRules.needs_primal(config) ?
+        fn.val(x.val) : nothing
+    primal_T = EnzymeRules.needs_primal(config) ? eltype(RT) : Nothing
+
+    shadow = if EnzymeRules.needs_shadow(config)
+        if EnzymeRules.width(config) == 1
+            fn.val(x.dval)
+        else
+            ntuple(Val(EnzymeRules.width(config))) do i
+                Base.@_inline_meta
+                fn.val(x.dval[i])
+            end
+        end
+    else
+        nothing
+    end
+    shadow_T = EnzymeRules.needs_shadow(config) ?
+        (EnzymeRules.width(config) == 1 ?
+            eltype(RT) : NTuple{EnzymeRules.width(config), eltype(RT)}) :
+        Nothing
+
+    return EnzymeRules.AugmentedReturn{primal_T, shadow_T, Nothing}(
+        primal, shadow, nothing)
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::Const{typeof(AMDGPU.hipfunction)},
+    ::Type{RT}, f::Const{F},
+    tt::Const{TT}; kwargs...
+) where {F, CT, RT <: EnzymeCore.Annotation{CT}, TT}
+    res = fn.val(f.val, tt.val; kwargs...)
+
+    primal = EnzymeRules.needs_primal(config) ? res : nothing
+    primal_T = EnzymeRules.needs_primal(config) ? CT : Nothing
+
+    shadow = if EnzymeRules.needs_shadow(config)
+        if EnzymeRules.width(config) == 1
+            res
+        else
+            ntuple(Val(EnzymeRules.width(config))) do i
+                Base.@_inline_meta
+                res
+            end
+        end
+    else
+        nothing
+    end
+    shadow_T = EnzymeRules.needs_shadow(config) ?
+        (EnzymeRules.width(config) == 1 ?
+            CT : NTuple{EnzymeRules.width(config), CT}) :
+        Nothing
+
+    return EnzymeRules.AugmentedReturn{primal_T, shadow_T, Nothing}(
+        primal, shadow, nothing)
+end
+
+function meta_augf(
+    f, tape::ROCDeviceArray{TapeType}, ::Val{ModifiedBetween}, args::Vararg{Any, N},
+) where {N, ModifiedBetween, TapeType}
+    forward, _ = EnzymeCore.autodiff_deferred_thunk(
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        TapeType,
+        Const{Core.Typeof(f)},
+        Const{Nothing},
+        map(typeof, args)...,
+    )
+
+    idx = 0
+    # idx *= gridGroupDim().x
+    idx += workgroupIdx().x - 1
+
+    idx *= gridGroupDim().y
+    idx += workgroupIdx().y - 1
+
+    idx *= gridGroupDim().z
+    idx += workgroupIdx().z - 1
+
+    idx *= workgroupDim().x
+    idx += workitemIdx().x - 1
+
+    idx *= workgroupDim().y
+    idx += workitemIdx().y - 1
+
+    idx *= workgroupDim().z
+    idx += workitemIdx().z - 1
+    idx += 1
+
+    @inbounds tape[idx] = forward(Const(f), args...)[1]
+    return
+end
+
+function EnzymeRules.augmented_primal(
+    config, fn::EnzymeCore.Annotation{AMDGPU.Runtime.HIPKernel{F,TT}},
+    ::Type{Const{Nothing}}, args...;
+    groupsize::AMDGPU.Runtime.ROCDim = 1,
+    gridsize::AMDGPU.Runtime.ROCDim = 1, kwargs...,
+) where {F,TT}
+    kernel_args = ((rocconvert(a) for a in args)...,)
+    kernel_tt = map(typeof, kernel_args)
+
+    ModifiedBetween = EnzymeRules.overwritten(config)
+    compiler_job = EnzymeCore.compiler_job_from_backend(
+        ROCBackend(), typeof(Base.identity), Tuple{Float64})
+    TapeType = EnzymeCore.tape_type(
+        compiler_job,
+        ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
+        Const{F}, Const{Nothing},
+        kernel_tt...,
+    )
+    threads = AMDGPU.Runtime.ROCDim3(groupsize)
+    blocks = AMDGPU.Runtime.ROCDim3(gridsize)
+    subtape = ROCArray{TapeType}(
+        undef, blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z)
+
+    GC.@preserve args subtape begin
+        subtape_cc = rocconvert(subtape)
+        kernel_tt2 = Tuple{(
+            F, typeof(subtape_cc), Val{ModifiedBetween}, kernel_tt...,
+        )...}
+        kernel = AMDGPU.hipfunction(meta_augf, kernel_tt2)
+        kernel(fn.val.f, subtape_cc, Val(ModifiedBetween), args...;
+            groupsize=(threads.x, threads.y, threads.z),
+            gridsize=(blocks.x, blocks.y, blocks.z),
+            kwargs...)
+    end
+    return EnzymeRules.AugmentedReturn{Nothing, Nothing, ROCArray}(nothing, nothing, subtape)
+end
+
+end
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index a923c7308..0dc148030 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -71,8 +71,8 @@ using .ROCmDiscovery
 
 include("utils.jl")
 
-include(joinpath("hsa", "HSA.jl"))
-include(joinpath("hip", "HIP.jl"))
+include("hsa/HSA.jl")
+include("hip/HIP.jl")
 
 using .HIP
 using .HIP: HIPContext, HIPDevice, HIPStream
@@ -107,7 +107,7 @@ export sync_workgroup, sync_workgroup_count, sync_workgroup_and, sync_workgroup_
 
 include("compiler/Compiler.jl")
 import .Compiler
-import .Compiler: hipfunction
+import .Compiler: hipfunction, compiler_config
 
 include("tls.jl")
 include("highlevel.jl")
@@ -126,12 +126,12 @@ include("kernels/reverse.jl")
 
 allowscalar(x::Bool) = GPUArrays.allowscalar(x)
 
-include(joinpath("blas", "rocBLAS.jl"))
-include(joinpath("solver", "rocSOLVER.jl"))
-include(joinpath("sparse", "rocSPARSE.jl"))
-include(joinpath("rand", "rocRAND.jl"))
-include(joinpath("fft", "rocFFT.jl"))
-include(joinpath("dnn", "MIOpen.jl"))
+include("blas/rocBLAS.jl")
+include("solver/rocSOLVER.jl")
+include("sparse/rocSPARSE.jl")
+include("rand/rocRAND.jl")
+include("fft/rocFFT.jl")
+include("dnn/MIOpen.jl")
 
 include("random.jl")
 
diff --git a/t.jl b/t.jl
new file mode 100644
index 000000000..90544aa7f
--- /dev/null
+++ b/t.jl
@@ -0,0 +1,24 @@
+using AMDGPU
+using EnzymeCore, Enzyme
+
+function square_kernel!(x)
+    i = workitemIdx().x
+    x[i] *= x[i]
+    return
+end
+
+function square!(x)
+    @roc groupsize=length(x) gridsize=1 square_kernel!(x)
+    return
+end
+
+function main()
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    Enzyme.autodiff(Reverse, square!, Duplicated(A, dA))
+    @show A
+    @show dA
+    @assert all(dA .≈ (2:2:128))
+    return
+end
+main()
diff --git a/test/enzyme_tests.jl b/test/enzyme_tests.jl
new file mode 100644
index 000000000..da25e71e2
--- /dev/null
+++ b/test/enzyme_tests.jl
@@ -0,0 +1,51 @@
+@testitem "enzyme" begin
+
+using AMDGPU
+using EnzymeCore, Enzyme
+using GPUCompiler
+
+@testset "CompilerJob from backend" begin
+    job = EnzymeCore.compiler_job_from_backend(ROCBackend(), typeof(()->nothing), Tuple{})
+    @test job isa GPUCompiler.CompilerJob
+end
+
+function square_kernel!(x)
+    i = workitemIdx().x
+    x[i] *= x[i]
+    return
+end
+
+function square!(x)
+    @roc groupsize=length(x) gridsize=1 square_kernel!(x)
+    return nothing
+end
+
+# @testset "Forward Kernel" begin
+#     A = ROCArray(collect(1.0:64.0))
+#     dA = ROCArray(ones(Float64, 64))
+#     Enzyme.autodiff(Forward, square!, Duplicated(A, dA))
+#     @test all(dA .≈ (2:2:128))
+
+#     A = ROCArray(collect(1.0:64.0))
+#     dA = ROCArray(ones(Float64, 64))
+#     dA2 = ROCArray(ones(Float64, 64) .* 3.0)
+#     Enzyme.autodiff(Forward, square!, BatchDuplicated(A, (dA, dA2)))
+#     @test all(dA .≈ (2:2:128))
+#     @test all(dA2 .≈ (2:2:128) .* 3)
+# end
+
+@testset "Reverse Kernel" begin
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    Enzyme.autodiff(Reverse, square!, Duplicated(A, dA))
+    @test all(dA .≈ (2:2:128))
+
+    A = ROCArray(collect(1.0:64.0))
+    dA = ROCArray(ones(Float64, 64))
+    dA2 = ROCArray(ones(Float64, 64) .* 3.0)
+    Enzyme.autodiff(Reverse, square!, BatchDuplicated(A, (dA, dA2)))
+    @test all(dA .≈ (2:2:128))
+    @test all(dA2 .≈ (2:2:128) .* 3)
+end
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 18e44e365..5369a77b5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,8 +2,10 @@ using AMDGPU
 using AMDGPU: Device, Runtime, @allowscalar
 import AMDGPU.Device:
     HostCallHolder, hostcall!
+import Pkg
 import PrettyTables
 import InteractiveUtils
+
 using LinearAlgebra
 using ReTestItems
 using Test
@@ -30,7 +32,7 @@ end
 
 AMDGPU.allowscalar(false)
 
-const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions"]
+const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
 
 function parse_flags!(args, flag; default = nothing, typ = typeof(default))
     for f in args
@@ -88,7 +90,14 @@ for test_name in ARGS
     """)
 end
 
-const TARGET_TESTS = isempty(ARGS) ? TEST_NAMES : ARGS
+# Do not run Enzyme tests by default.
+const TARGET_TESTS = isempty(ARGS) ?
+    [t for t in TEST_NAMES if t != "enzyme"] :
+    ARGS
+
+if "enzyme" in TARGET_TESTS
+    Pkg.add(["EnzymeCore", "Enzyme"])
+end
 
 # Run tests in parallel.
 np = set_jobs ? jobs : (Sys.CPU_THREADS ÷ 2)