From cc510acbca3c2c9e93fb646ce768f95dc5e3b597 Mon Sep 17 00:00:00 2001 From: brabreda Date: Mon, 14 Aug 2023 13:12:39 +0200 Subject: [PATCH 01/14] kernel config struct --- src/KernelAbstractions.jl | 3 +++ src/config.jl | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/config.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 335d2cd9..3fa3e74f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -726,4 +726,7 @@ end end end +# Config +include("config.jl") + end #module diff --git a/src/config.jl b/src/config.jl new file mode 100644 index 00000000..95712745 --- /dev/null +++ b/src/config.jl @@ -0,0 +1,35 @@ +# Kernel config struct + +export Config + +struct Config{ + GROUPSIZE, + MAX_NDRANGE, + + ITEMS_PER_WORKITEM, + USE_ATOMICS, + USE_WARPS + } + + function Config(groupsize, max_ndrange, items_per_workitem , use_atomics, use_warps) + new{groupsize, max_ndrange, items_per_workitem, use_atomics, use_warps}() + end +end + +@inline function Base.getproperty(conf::Config{GROUPSIZE, MAX_NDRANGE, ITEMS_PER_WORKITEM, USE_ATOMICS, USE_WARPS}, sym::Symbol) where { GROUPSIZE, MAX_NDRANGE,ITEMS_PER_WORKITEM, USE_ATOMICS, USE_WARPS } + + if sym == :groupsize + GROUPSIZE + elseif sym == :max_ndrange + MAX_NDRANGE + elseif sym == :items_per_workitem + ITEMS_PER_WORKITEM + elseif sym == :use_atomics + USE_ATOMICS + elseif sym == :use_warps + USE_WARPS + else + # fallback for nothing + getfield(conf, sym) + end +end \ No newline at end of file From 1c1e459080bf4ba5a91cb257cab9d12812567d0c Mon Sep 17 00:00:00 2001 From: brabreda Date: Mon, 14 Aug 2023 13:38:34 +0200 Subject: [PATCH 02/14] group- and warpreduce --- src/reduce.jl | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/reduce.jl diff --git a/src/reduce.jl b/src/reduce.jl new file mode 100644 index 00000000..09cb8828 --- /dev/null +++ b/src/reduce.jl @@ -0,0 +1,121 @@ +export groupreduce, warpreduce + +macro groupreduce(op, val, neutral, conf) + quote + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))), Val($(esc(conf)).use_warps)) + end +end + +macro warpreduce(op, val) + quote + $__warpreduce(esc(op))($(esc(val))) + end +end + + +@inline _map_getindex(args::Tuple, I) = ((args[1][I]), _map_getindex(Base.tail(args), I)...) +@inline _map_getindex(args::Tuple{Any}, I) = ((args[1][I]),) +@inline _map_getindex(args::Tuple{}, I) = () + + +# groupreduction using warp intrinsics +@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T} + threadIdx_local = KernelAbstractions.@index(Local) + groupsize = KernelAbstractions.@groupsize()[1] + + shared = KernelAbstractions.@localmem(T, 32) + + warpIdx, warpLane = fldmod1(threadIdx_local, 32) + + # each warp performs partial reduction + val = KernelAbstractions.@warpreduce(op, val) + + # write reduced value to shared memory + if warpLane == 1 + @inbounds shared[warpIdx] = val + end + + # wait for all partial reductions + KernelAbstractions.@synchronize() + + # read from shared memory only if that warp existed + val = if threadIdx_local <= fld1(groupsize, 32) + @inbounds shared[warpLane] + else + neutral + end + + # final reduce within first warp + if warpIdx == 1 + val = KernelAbstractions.@warpreduce(op, val) + end + + return val + +end + +# groupreduction using local memory +@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{false}) where {T} + threadIdx_local = KernelAbstractions.@index(Local) + groupsize = KernelAbstractions.@groupsize()[1] + + shared = KernelAbstractions.@localmem(T, groupsize) + + @inbounds shared[threadIdx_local] = val + + # perform the reduction + d = 1 + while d < groupsize + KernelAbstractions.@synchronize() + index = 2 * d * (threadIdx_local-1) + 1 + @inbounds if index <= groupsize + other_val = if index + d <= groupsize + shared[index+d] + else + neutral + end + shared[index] = op(shared[index], other_val) + end + d *= 2 + end + + # load the final value on the first thread + if threadIdx_local == 1 + val = @inbounds shared[threadIdx_local] + end + + return val +end + +@kernel function reduce_kernel(f, op, neutral, grain, R, A , conf) + # values for the kernel + threadIdx_local = @index(Local) + threadIdx_global = @index(Global) + groupIdx = @index(Group) + gridsize = @ndrange()[1] + + + # load neutral value + neutral = if neutral === nothing + R[1] + else + neutral + end + + val = op(neutral, neutral) + + # every thread reduces a few values parrallel + index = threadIdx_global + while index <= length(A) + val = op(val,A[index]) + index += gridsize + end + + # reduce every block to a single value + val = @reduce(op, val, neutral, conf) + + # write reduces value to memory + if threadIdx_local == 1 + R[groupIdx] = val + end +end From dd3a0ca06295747e9d70862913198cfe532a92ff Mon Sep 17 00:00:00 2001 From: brabreda Date: Sat, 9 Sep 2023 16:25:57 +0200 Subject: [PATCH 03/14] fixes --- .github/dependabot.yml | 7 +++ .github/workflows/ci.yml | 10 +-- src/KernelAbstractions.jl | 27 ++++++-- src/config.jl | 35 ----------- src/reduce.jl | 125 ++++++++++++++++---------------------- test/extensions/enzyme.jl | 2 +- test/reduce.jl | 39 ++++++++++++ 7 files changed, 124 insertions(+), 121 deletions(-) create mode 100644 .github/dependabot.yml delete mode 100644 src/config.jl create mode 100644 test/reduce.jl diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..d60f0707 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "monthly" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9564cc64..b8c4b2dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,12 +34,12 @@ jobs: arch: - x64 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: julia-actions/setup-julia@v1 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 + - uses: actions/cache@v3 env: cache-name: cache-artifacts with: @@ -54,14 +54,14 @@ jobs: with: annotate: true - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v3 with: file: lcov.info docs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: julia-actions/setup-julia@v1 with: version: '1' @@ -75,7 +75,7 @@ jobs: name: Doctests runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: julia-actions/setup-julia@v1 with: version: '1' diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 3fa3e74f..668f4da1 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -139,10 +139,20 @@ function unsafe_free! end # - @groupsize # - @ndrange ### - function groupsize end function ndrange end +""" + @subgroupsize() + + returns the GPUs subgroupsize. +""" +macro subgroupsize() + quote + $__subgroupsize() + end +end + """ @groupsize() @@ -384,9 +394,9 @@ function __index_Local_Cartesian end function __index_Group_Cartesian end function __index_Global_Cartesian end -__index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) -__index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) -__index_Global_NTuple(ctx, I...) = Tuple(__index_Global_Cartesian(ctx, I...)) +@inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) +@inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) +@inline __index_Global_NTuple(ctx, I...) = Tuple(__index_Global_Cartesian(ctx, I...)) struct ConstAdaptor end @@ -657,6 +667,10 @@ function __synchronize() error("@synchronize used outside kernel or not captured") end +function __subgroupsize() + error("@subgroupsize used outside kernel or not captured") +end + @generated function __print(items...) str = "" args = [] @@ -700,6 +714,7 @@ end @inbounds A[I] = B[I] end + # CPU backend include("cpu.jl") @@ -726,7 +741,7 @@ end end end -# Config -include("config.jl") +# group- and subgroupreduce +include("reduce.jl") end #module diff --git a/src/config.jl b/src/config.jl deleted file mode 100644 index 95712745..00000000 --- a/src/config.jl +++ /dev/null @@ -1,35 +0,0 @@ -# Kernel config struct - -export Config - -struct Config{ - GROUPSIZE, - MAX_NDRANGE, - - ITEMS_PER_WORKITEM, - USE_ATOMICS, - USE_WARPS - } - - function Config(groupsize, max_ndrange, items_per_workitem , use_atomics, use_warps) - new{groupsize, max_ndrange, items_per_workitem, use_atomics, use_warps}() - end -end - -@inline function Base.getproperty(conf::Config{GROUPSIZE, MAX_NDRANGE, ITEMS_PER_WORKITEM, USE_ATOMICS, USE_WARPS}, sym::Symbol) where { GROUPSIZE, MAX_NDRANGE,ITEMS_PER_WORKITEM, USE_ATOMICS, USE_WARPS } - - if sym == :groupsize - GROUPSIZE - elseif sym == :max_ndrange - MAX_NDRANGE - elseif sym == :items_per_workitem - ITEMS_PER_WORKITEM - elseif sym == :use_atomics - USE_ATOMICS - elseif sym == :use_warps - USE_WARPS - else - # fallback for nothing - getfield(conf, sym) - end -end \ No newline at end of file diff --git a/src/reduce.jl b/src/reduce.jl index 09cb8828..6a1a4b08 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,121 +1,98 @@ -export groupreduce, warpreduce +export @groupreduce, @subgroupreduce -macro groupreduce(op, val, neutral, conf) +""" + +@subgroupreduce(op, val) + +reduce values across a subgroup. This operation is only supported if subgroups are supported by the backend. +""" +macro subgroupreduce(op, val) quote - $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))), Val($(esc(conf)).use_warps)) + $__subgroupreduce($(esc(op)),$(esc(val))) end end -macro warpreduce(op, val) - quote - $__warpreduce(esc(op))($(esc(val))) - end +function __subgroupreduce(op, val) + error("@subgroupreduce used outside kernel, not captured, or not supported") end +""" -@inline _map_getindex(args::Tuple, I) = ((args[1][I]), _map_getindex(Base.tail(args), I)...) -@inline _map_getindex(args::Tuple{Any}, I) = ((args[1][I]),) -@inline _map_getindex(args::Tuple{}, I) = () +@groupreduce(op, val, neutral, use_subgroups) +Reduce values across a block +- `op`: the operator of the reduction +- `val`: value that each thread contibutes to the values that need to be reduced +- `netral`: value of the operator, so that `op(netural, neutral) = neutral`` +- `use_subgroups`: make use of the subgroupreduction of the groupreduction +""" +macro groupreduce(op, val, neutral, use_subgroups) + quote + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val))), Val(use_subgroups)) + end +end -# groupreduction using warp intrinsics @inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T} - threadIdx_local = KernelAbstractions.@index(Local) - groupsize = KernelAbstractions.@groupsize()[1] + idx_in_group = @index(Local) + groupsize = @groupsize()[1] + subgroupsize = @subgroupsize() - shared = KernelAbstractions.@localmem(T, 32) + localmem = @localmem(T, subgroupsize) - warpIdx, warpLane = fldmod1(threadIdx_local, 32) + idx_subgroup, idx_in_subgroup = fldmod1(idx_in_group, subgroupsize) - # each warp performs partial reduction - val = KernelAbstractions.@warpreduce(op, val) + # first subgroup reduction + val = @subgroupreduce(op, val) - # write reduced value to shared memory - if warpLane == 1 - @inbounds shared[warpIdx] = val + # store partial results in local memory + if idx_in_subgroup == 1 + @inbounds localmem[idx_in_subgroup] = val end - # wait for all partial reductions - KernelAbstractions.@synchronize() + @synchronize() - # read from shared memory only if that warp existed - val = if threadIdx_local <= fld1(groupsize, 32) - @inbounds shared[warpLane] + val = if idx_in_subgroup <= fld1(groupsize, subgroupsize) + @inbounds localmem[idx_in_subgroup] else neutral end - # final reduce within first warp - if warpIdx == 1 - val = KernelAbstractions.@warpreduce(op, val) + # second subgroup reduction to reduce partial results + if idx_in_subgroup == 1 + val = @subgroupreduce(op, val) end return val - end -# groupreduction using local memory @inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{false}) where {T} - threadIdx_local = KernelAbstractions.@index(Local) - groupsize = KernelAbstractions.@groupsize()[1] + idx_in_group = @index(Local) + groupsize = @groupsize()[1] - shared = KernelAbstractions.@localmem(T, groupsize) + localmem = @localmem(T, groupsize) - @inbounds shared[threadIdx_local] = val + @inbounds localmem[idx_in_group] = val # perform the reduction d = 1 while d < groupsize - KernelAbstractions.@synchronize() - index = 2 * d * (threadIdx_local-1) + 1 + @synchronize() + index = 2 * d * (idx_in_group-1) + 1 @inbounds if index <= groupsize other_val = if index + d <= groupsize - shared[index+d] + localmem[index+d] else neutral end - shared[index] = op(shared[index], other_val) + localmem[index] = op(localmem[index], other_val) end d *= 2 end # load the final value on the first thread - if threadIdx_local == 1 - val = @inbounds shared[threadIdx_local] + if idx_in_group == 1 + val = @inbounds localmem[idx_in_group] end return val -end - -@kernel function reduce_kernel(f, op, neutral, grain, R, A , conf) - # values for the kernel - threadIdx_local = @index(Local) - threadIdx_global = @index(Global) - groupIdx = @index(Group) - gridsize = @ndrange()[1] - - - # load neutral value - neutral = if neutral === nothing - R[1] - else - neutral - end - - val = op(neutral, neutral) - - # every thread reduces a few values parrallel - index = threadIdx_global - while index <= length(A) - val = op(val,A[index]) - index += gridsize - end - - # reduce every block to a single value - val = @reduce(op, val, neutral, conf) - - # write reduces value to memory - if threadIdx_local == 1 - R[groupIdx] = val - end -end +end \ No newline at end of file diff --git a/test/extensions/enzyme.jl b/test/extensions/enzyme.jl index 6bf08a64..e65cb20b 100644 --- a/test/extensions/enzyme.jl +++ b/test/extensions/enzyme.jl @@ -10,7 +10,7 @@ end function caller(A, backend) kernel = square!(backend) kernel(A, ndrange=size(A)) - synchronize(backend) + KernelAbstractions.synchronize(backend) end function enzyme_testsuite(backend, ArrayT, supports_reverse=true) diff --git a/test/reduce.jl b/test/reduce.jl new file mode 100644 index 00000000..7406b66d --- /dev/null +++ b/test/reduce.jl @@ -0,0 +1,39 @@ +using KernelAbstractions, Test + + + + +@kernel function reduce(a, b, op, neutral) + idx_in_group = @index(Local) + + val = a[idx_in_group] + + val = @groupreduce(op, val, netral, false) + + b[1] = val +end + +function(backend, ArrayT) + @testset "groupreduce one group" begin + @testset for op in (+,*,max,min) + @testset for type in (Int32, Float32, Float64) + @test test_1group_groupreduce(backend, ArrayT ,op, type, op(neutral)) + end + end + end +end + +function test_1group_groupreduce(backend,ArrayT, op, type, neutral) + a = rand(type, 32) + b = ArrayT(a) + + c = similar(b,1) + reduce(a, c, op, neutral) + + expected = mapreduce(x->x^2, +, a) + actual = c[1] + return expected = actual +end + + + From 546e8c9cb842ee7be4ee7c3214f2daa19c57c019 Mon Sep 17 00:00:00 2001 From: brabreda Date: Tue, 29 Aug 2023 00:54:06 +0200 Subject: [PATCH 04/14] add warpsize to config --- src/reduce.jl | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index 6a1a4b08..b69d18f9 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,4 +1,10 @@ -export @groupreduce, @subgroupreduce +export @groupreduce, @warpreduce + +macro warpreduce(op, val) + quote + $__warpreduce($(esc(op)),$(esc(val))) + end +end """ @@ -12,26 +18,12 @@ macro subgroupreduce(op, val) end end -function __subgroupreduce(op, val) - error("@subgroupreduce used outside kernel, not captured, or not supported") +function __warpreduce(op, val) + error("@warpreduce used outside kernel, not captured, or not supported") end -""" - -@groupreduce(op, val, neutral, use_subgroups) - -Reduce values across a block -- `op`: the operator of the reduction -- `val`: value that each thread contibutes to the values that need to be reduced -- `netral`: value of the operator, so that `op(netural, neutral) = neutral`` -- `use_subgroups`: make use of the subgroupreduction of the groupreduction -""" -macro groupreduce(op, val, neutral, use_subgroups) - quote - $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val))), Val(use_subgroups)) - end -end +# groupreduction using warp intrinsics @inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T} idx_in_group = @index(Local) groupsize = @groupsize()[1] @@ -95,4 +87,4 @@ end end return val -end \ No newline at end of file +end From 360280830670de94f507aba31d663ad3a4127476 Mon Sep 17 00:00:00 2001 From: brabreda Date: Tue, 29 Aug 2023 16:05:45 +0200 Subject: [PATCH 05/14] adding & subgroupreduce --- src/KernelAbstractions.jl | 6 ++++++ src/reduce.jl | 12 +++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 668f4da1..c1c4829e 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -142,6 +142,12 @@ function unsafe_free! end function groupsize end function ndrange end +macro subgroupsize() + quote + $__subgroupsize() + end +end + """ @subgroupsize() diff --git a/src/reduce.jl b/src/reduce.jl index b69d18f9..5ce9c88a 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,8 +1,14 @@ -export @groupreduce, @warpreduce +export @groupreduce, @subgroupreduce -macro warpreduce(op, val) +@enum GroupReduceAlgorithm begin + THREADS + WARP_WARP + SEQUENTIAL_WARP +end + +macro subgroupreduce(op, val) quote - $__warpreduce($(esc(op)),$(esc(val))) + $__subgroupreduce($(esc(op)),$(esc(val))) end end From 42a796096bc9abd433d71c139bd15795699e1afa Mon Sep 17 00:00:00 2001 From: brabreda Date: Fri, 1 Sep 2023 08:45:06 +0200 Subject: [PATCH 06/14] deps --- Project.toml | 3 +++ src/reduce.jl | 13 ++----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 42949387..d20d6698 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" [compat] Adapt = "0.4, 1.0, 2.0, 3.0" @@ -27,8 +28,10 @@ Requires = "1.3" StaticArrays = "0.12, 1.0" UnsafeAtomics = "0.2.1" UnsafeAtomicsLLVM = "0.1" +GPUArrays = "8.8.1" julia = "1.6" + [extensions] EnzymeExt = "EnzymeCore" diff --git a/src/reduce.jl b/src/reduce.jl index 5ce9c88a..c623d6dc 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,16 +1,7 @@ -export @groupreduce, @subgroupreduce +using GPUArrays -@enum GroupReduceAlgorithm begin - THREADS - WARP_WARP - SEQUENTIAL_WARP -end +export @groupreduce, @subgroupreduce -macro subgroupreduce(op, val) - quote - $__subgroupreduce($(esc(op)),$(esc(val))) - end -end """ From c96a24a1dc97866b3be96e02e7b2827886291f0e Mon Sep 17 00:00:00 2001 From: brabreda Date: Wed, 6 Sep 2023 07:59:32 +0200 Subject: [PATCH 07/14] added docs & removed part for GPUArrays --- src/reduce.jl | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index c623d6dc..6a1a4b08 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,8 +1,5 @@ -using GPUArrays - export @groupreduce, @subgroupreduce - """ @subgroupreduce(op, val) @@ -15,12 +12,26 @@ macro subgroupreduce(op, val) end end -function __warpreduce(op, val) - error("@warpreduce used outside kernel, not captured, or not supported") +function __subgroupreduce(op, val) + error("@subgroupreduce used outside kernel, not captured, or not supported") end +""" + +@groupreduce(op, val, neutral, use_subgroups) + +Reduce values across a block +- `op`: the operator of the reduction +- `val`: value that each thread contibutes to the values that need to be reduced +- `netral`: value of the operator, so that `op(netural, neutral) = neutral`` +- `use_subgroups`: make use of the subgroupreduction of the groupreduction +""" +macro groupreduce(op, val, neutral, use_subgroups) + quote + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val))), Val(use_subgroups)) + end +end -# groupreduction using warp intrinsics @inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T} idx_in_group = @index(Local) groupsize = @groupsize()[1] @@ -84,4 +95,4 @@ end end return val -end +end \ No newline at end of file From d2d65be472033fe0aab8f7e8d0646aeb482fe740 Mon Sep 17 00:00:00 2001 From: brabreda Date: Sat, 9 Sep 2023 13:45:23 +0200 Subject: [PATCH 08/14] added docs & tests --- src/KernelAbstractions.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index c1c4829e..b1ac8aac 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -142,6 +142,11 @@ function unsafe_free! end function groupsize end function ndrange end +""" + @subgroupsize() + + returns the GPUs subgroupsize. +""" macro subgroupsize() quote $__subgroupsize() From 128a5f0627781532536cd87f6c937ab3fefbd8c2 Mon Sep 17 00:00:00 2001 From: brabreda Date: Sat, 9 Sep 2023 14:04:20 +0200 Subject: [PATCH 09/14] Remove deps for PR --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index d20d6698..6bb57a6c 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,6 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" [compat] Adapt = "0.4, 1.0, 2.0, 3.0" @@ -28,7 +27,6 @@ Requires = "1.3" StaticArrays = "0.12, 1.0" UnsafeAtomics = "0.2.1" UnsafeAtomicsLLVM = "0.1" -GPUArrays = "8.8.1" julia = "1.6" From b899685c0a45eaeaf2cdc17b3ddda60467f388a4 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 22 Aug 2023 12:44:08 -0400 Subject: [PATCH 10/14] Ensure NTuple index functions are inlined (#414) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6bb57a6c..b5f2534e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.8" +version = "0.9.9" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 1cdb6d640d0dfc4f559c27a2aa0dbf2114c805c7 Mon Sep 17 00:00:00 2001 From: brabreda Date: Sat, 9 Sep 2023 16:15:30 +0200 Subject: [PATCH 11/14] manifest.toml, reset project.toml removed config include removed using GPUArrays versions --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index b5f2534e..47a5f9ef 100644 --- a/Project.toml +++ b/Project.toml @@ -29,7 +29,6 @@ UnsafeAtomics = "0.2.1" UnsafeAtomicsLLVM = "0.1" julia = "1.6" - [extensions] EnzymeExt = "EnzymeCore" From 41356d323d3036cffceac4de32b822a176cad452 Mon Sep 17 00:00:00 2001 From: brabreda Date: Thu, 14 Sep 2023 22:18:02 +0200 Subject: [PATCH 12/14] move groupreduce with subgroups to backends --- src/KernelAbstractions.jl | 26 +---------------- src/reduce.jl | 59 ++++----------------------------------- test/reduce.jl | 2 +- 3 files changed, 7 insertions(+), 80 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index b1ac8aac..da68d06c 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -142,27 +142,6 @@ function unsafe_free! end function groupsize end function ndrange end -""" - @subgroupsize() - - returns the GPUs subgroupsize. -""" -macro subgroupsize() - quote - $__subgroupsize() - end -end - -""" - @subgroupsize() - - returns the GPUs subgroupsize. -""" -macro subgroupsize() - quote - $__subgroupsize() - end -end """ @groupsize() @@ -678,9 +657,6 @@ function __synchronize() error("@synchronize used outside kernel or not captured") end -function __subgroupsize() - error("@subgroupsize used outside kernel or not captured") -end @generated function __print(items...) str = "" @@ -752,7 +728,7 @@ end end end -# group- and subgroupreduce +# groupreduce include("reduce.jl") end #module diff --git a/src/reduce.jl b/src/reduce.jl index 6a1a4b08..6a385937 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,24 +1,8 @@ -export @groupreduce, @subgroupreduce +export @groupreduce """ -@subgroupreduce(op, val) - -reduce values across a subgroup. This operation is only supported if subgroups are supported by the backend. -""" -macro subgroupreduce(op, val) - quote - $__subgroupreduce($(esc(op)),$(esc(val))) - end -end - -function __subgroupreduce(op, val) - error("@subgroupreduce used outside kernel, not captured, or not supported") -end - -""" - -@groupreduce(op, val, neutral, use_subgroups) + @groupreduce(op, val, neutral, use_subgroups) Reduce values across a block - `op`: the operator of the reduction @@ -26,46 +10,13 @@ Reduce values across a block - `netral`: value of the operator, so that `op(netural, neutral) = neutral`` - `use_subgroups`: make use of the subgroupreduction of the groupreduction """ -macro groupreduce(op, val, neutral, use_subgroups) +macro groupreduce(op, val, neutral) quote - $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val))), Val(use_subgroups)) + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val)))) end end -@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T} - idx_in_group = @index(Local) - groupsize = @groupsize()[1] - subgroupsize = @subgroupsize() - - localmem = @localmem(T, subgroupsize) - - idx_subgroup, idx_in_subgroup = fldmod1(idx_in_group, subgroupsize) - - # first subgroup reduction - val = @subgroupreduce(op, val) - - # store partial results in local memory - if idx_in_subgroup == 1 - @inbounds localmem[idx_in_subgroup] = val - end - - @synchronize() - - val = if idx_in_subgroup <= fld1(groupsize, subgroupsize) - @inbounds localmem[idx_in_subgroup] - else - neutral - end - - # second subgroup reduction to reduce partial results - if idx_in_subgroup == 1 - val = @subgroupreduce(op, val) - end - - return val -end - -@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{false}) where {T} +@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}) where {T} idx_in_group = @index(Local) groupsize = @groupsize()[1] diff --git a/test/reduce.jl b/test/reduce.jl index 7406b66d..cf418db0 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -8,7 +8,7 @@ using KernelAbstractions, Test val = a[idx_in_group] - val = @groupreduce(op, val, netral, false) + val = @groupreduce(op, val, netral) b[1] = val end From 45844cede9c3aa8dd7c1186202060c412fcb50f4 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 19 Sep 2023 10:44:22 -0400 Subject: [PATCH 13/14] cleanup and wire-up tests --- src/reduce.jl | 9 ++++----- test/reduce.jl | 35 +++++++++++++++++++---------------- test/testsuite.jl | 5 +++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index 6a385937..b17ecaba 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,13 +1,12 @@ export @groupreduce """ - @groupreduce(op, val, neutral, use_subgroups) Reduce values across a block - `op`: the operator of the reduction - `val`: value that each thread contibutes to the values that need to be reduced -- `netral`: value of the operator, so that `op(netural, neutral) = neutral`` +- `neutral`: value of the operator, so that `op(neutral, neutral) = neutral` - `use_subgroups`: make use of the subgroupreduction of the groupreduction """ macro groupreduce(op, val, neutral) @@ -17,8 +16,8 @@ macro groupreduce(op, val, neutral) end @inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}) where {T} - idx_in_group = @index(Local) - groupsize = @groupsize()[1] + idx_in_group = @index(Local, Linear) + groupsize = prod(@groupsize()) localmem = @localmem(T, groupsize) @@ -46,4 +45,4 @@ end end return val -end \ No newline at end of file +end diff --git a/test/reduce.jl b/test/reduce.jl index cf418db0..91537c40 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -1,34 +1,37 @@ using KernelAbstractions, Test - - - @kernel function reduce(a, b, op, neutral) - idx_in_group = @index(Local) - - val = a[idx_in_group] + I = @index(Global) + gI = @index(Group, Linear) + val = a[I] - val = @groupreduce(op, val, netral) + val = @groupreduce(op, val, neutral) - b[1] = val + b[gI] = val end -function(backend, ArrayT) +function reduce_testset(backend, ArrayT) @testset "groupreduce one group" begin - @testset for op in (+,*,max,min) + @testset for op in (+, *, max, min) @testset for type in (Int32, Float32, Float64) - @test test_1group_groupreduce(backend, ArrayT ,op, type, op(neutral)) + @test test_groupreduce(backend, ArrayT, op, type, op(neutral), 8) + @test test_groupreduce(backend, ArrayT, op, type, op(neutral), 16) + @test test_groupreduce(backend, ArrayT, op, type, op(neutral), 32) + @test test_groupreduce(backend, ArrayT, op, type, op(neutral), 64) end end end end -function test_1group_groupreduce(backend,ArrayT, op, type, neutral) - a = rand(type, 32) - b = ArrayT(a) +function test_groupreduce(backend, ArrayT, op, type, neutral, N) + a = rand(type, N) + b = ArrayT(a) - c = similar(b,1) - reduce(a, c, op, neutral) + gsz = 64 + ngroups = ceil(N/gsz) + c = similar(b, ngroups) + kernel = reduce(backend, (gsz,)) + kernel(a, c, op, neutral) expected = mapreduce(x->x^2, +, a) actual = c[1] diff --git a/test/testsuite.jl b/test/testsuite.jl index cd78e76d..c4a32b83 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -32,6 +32,7 @@ include("reflection.jl") include("examples.jl") include("convert.jl") include("specialfunctions.jl") +include("reduce.jl") function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{String}()) @conditional_testset "Unittests" skip_tests begin @@ -78,6 +79,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ convert_testsuite(backend, AT) end + @conditional_testset "Reduce" skip_tests begin + reduce_testsuite(backend, AT) + end + @conditional_testset "Examples" skip_tests begin examples_testsuite(backend_str) end From c5dc35657eb36df9d4dbd6ac5b203da2fe6ad528 Mon Sep 17 00:00:00 2001 From: brabreda Date: Thu, 11 Jan 2024 21:21:56 +0100 Subject: [PATCH 14/14] fixup use of groupsize --- src/reduce.jl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index 6a385937..8170b4c4 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -7,30 +7,35 @@ export @groupreduce Reduce values across a block - `op`: the operator of the reduction - `val`: value that each thread contibutes to the values that need to be reduced -- `netral`: value of the operator, so that `op(netural, neutral) = neutral`` -- `use_subgroups`: make use of the subgroupreduction of the groupreduction +- `neutral`: value of the operator, so that `op(netural, neutral) = neutral`` +- `groupsize` (optional): specify the groupszie. If not specified @groupsize is used but this is generally slower. """ macro groupreduce(op, val, neutral) + quote + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), Val(prod(groupsize($(esc(:__ctx__)))))) + end +end + +macro groupreduce(op, val, neutral, groupsize) quote - $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(typeof(val)))) + $__groupreduce($(esc(:__ctx__)),$(esc(op)), $(esc(val)), $(esc(neutral)), $(esc(groupsize))) end end -@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}) where {T} +@inline function __groupreduce(__ctx__, op, val::T, neutral, ::Val{GROUPSIZE}) where {T,GROUPSIZE} idx_in_group = @index(Local) - groupsize = @groupsize()[1] - localmem = @localmem(T, groupsize) + localmem = @localmem(T, GROUPSIZE) @inbounds localmem[idx_in_group] = val # perform the reduction d = 1 - while d < groupsize + while d < GROUPSIZE @synchronize() index = 2 * d * (idx_in_group-1) + 1 - @inbounds if index <= groupsize - other_val = if index + d <= groupsize + @inbounds if index <= GROUPSIZE + other_val = if index + d <= GROUPSIZE localmem[index+d] else neutral