
Commit

Clean-up reverse kernels.
maleadt committed Sep 16, 2019
1 parent a295347 commit d8e4a26
Showing 2 changed files with 64 additions and 47 deletions.
108 changes: 61 additions & 47 deletions src/array.jl
@@ -280,104 +280,118 @@ end

## reversing

# with `start`/`stop` arguments
# 1-dimensional

function _reverse(input::CuVector{T}, output::CuVector{T}) where {T}
@assert length(input) == length(output)
# in-place, using shared memory to temporarily store values
function Base.reverse!(data::CuVector{T}, start=1, stop=length(data)) where {T}
range = start:stop

nthreads = 256
nblocks = ceil(Int, length(input) / nthreads)
shmem = nthreads * sizeof(T)

function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}) where {T}
function kernel(data::CuDeviceVector{T}, range::UnitRange) where {T}
shared = @cuDynamicSharedMem(T, blockDim().x)

# load one element per thread from device memory and buffer it in reversed order

offset_in = blockDim().x * (blockIdx().x - 1)
offset_in = first(range) + blockDim().x * (blockIdx().x - 1) - 1
index_in = offset_in + threadIdx().x

if index_in <= length(input)
if index_in <= last(range)
index_shared = blockDim().x - threadIdx().x + 1
@inbounds shared[index_shared] = input[index_in]
@inbounds shared[index_shared] = data[index_in]
end

sync_threads()

# write back in forward order, but to the reversed block offset as before

offset_out = length(output) - blockDim().x * blockIdx().x
offset_out = last(range) - blockDim().x * blockIdx().x
index_out = offset_out + threadIdx().x

if 1 <= index_out <= length(output)
if index_out in range
index_shared = threadIdx().x
@inbounds output[index_out] = shared[index_shared]
@inbounds data[index_out] = shared[index_shared]
end

return
end

@cuda threads=nthreads blocks=nblocks shmem=shmem kernel(input, output)
nthreads = 256
nblocks = ceil(Int, length(range) / nthreads)
shmem = nthreads * sizeof(T)

@cuda threads=nthreads blocks=nblocks shmem=shmem kernel(data, range)

return
return data
end
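# A minimal usage sketch for the in-place path above (assuming this is the
# CuArrays package and a CUDA device is available); elements outside
# start:stop are left untouched:
#
#   using CuArrays
#   a = CuArray(collect(1.0:10.0))
#   reverse!(a, 3, 7)   # reverse elements 3:7 in place
#   Array(a)            # == [1, 2, 7, 6, 5, 4, 3, 8, 9, 10]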

function Base.reverse!(v::CuVector, start=1, stop=length(v))
v′ = view(v, start:stop)
_reverse(v′, v′)
return v
end
# out-of-place
function Base.reverse(input::CuVector{T}, start=1, stop=length(input)) where {T}
range = start:stop

output = similar(input)
start > 1 && copyto!(output, 1, input, 1, start-1)
stop < length(input) && copyto!(output, stop+1, input, stop+1)

function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}, range::UnitRange) where {T}
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x

index_in = first(range) + index - 1
index_out = last(range) - index + 1

if index_in <= last(range)
@inbounds output[index_out] = input[index_in]
end

return
end

nthreads = 256
nblocks = ceil(Int, length(range) / nthreads)

# this kernel uses no dynamic shared memory, so no shmem is requested
@cuda threads=nthreads blocks=nblocks kernel(input, output, range)

function Base.reverse(v::CuVector, start=1, stop=length(v))
v′ = similar(v)
start > 1 && copyto!(v′, 1, v, 1, start-1)
_reverse(view(v, start:stop), view(v′, start:stop))
stop < length(v) && copyto!(v′, stop+1, v, stop+1)
return v′
return output
end
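# The out-of-place kernel above needs no staging in shared memory: every
# thread reads one element and writes it directly to the mirrored index. A
# CPU-side sketch of the same mapping (hypothetical helper, illustration only):
#
#   function reverse_range_cpu(input::Vector, start=1, stop=length(input))
#       range = start:stop
#       output = copy(input)   # preserves elements outside the range
#       for index in 1:length(range)
#           output[last(range) - index + 1] = input[first(range) + index - 1]
#       end
#       return output
#   end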

# with `dims` argument
# n-dimensional

function Base.reverse!(input::CuArray{T, N}, output::CuArray{T, N}; dims::Integer) where {T, N}
function Base.reverse(input::CuArray{T, N}; dims::Integer) where {T, N}
if !(1 ≤ dims ≤ length(size(input)))
throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(length(size(input)))"))
end

if !all(size(input) .== size(output))
DimensionMismatch("input and output arrays are not of the same dimensions.")
end

nthreads = 256
nblocks = ceil(Int, length(input)/nthreads)
output = similar(input)

shape = [size(input)...]

numelemsinprevdims = prod(shape[1:dims-1])
numelemsincurrdim = shape[dims]

function kernel(input::CuDeviceArray{T, N}, output::CuDeviceArray{T, N}) where {T, N}
# The plan is to treat an ND Array as a 1D Array. An element at pos
# [i1, i2, i3, ... , i{x}, ..., i{n}], after reversing about the xth
# dimension, will be at [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}],
# where d_{x} is the size of the xth dimension. We use this information
# to find the location of the original element in 1D representation of the
# flipped array. ik is the index of an element in the original array along
# dimension that we will flip.
# treat an ND array as 1D
#
# after reversing by the xth dimension an element at pos
# [i1, i2, i3, ... , i{x}, ..., i{n}] will be at
# [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}], where
# d{x} is the size of the xth dimension

offset_in = blockDim().x * (blockIdx().x - 1)
index_in = offset_in + threadIdx().x

# the index of an element in the original array along the dimension that we will flip
ik = ((UInt32(ceil(index_in / numelemsinprevdims)) - 1) % numelemsincurrdim) + 1
index_out = UInt32(index_in + (numelemsincurrdim - 2ik + 1) * numelemsinprevdims)
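# worked example (assuming a 3×4 array reversed along dims=2, so
# numelemsinprevdims = 3 and numelemsincurrdim = 4): for index_in = 5,
# i.e. the element at [2, 2], ik = ((cld(5, 3) - 1) % 4) + 1 = 2 and
# index_out = 5 + (4 - 2*2 + 1) * 3 = 8, the linear index of [2, 3];
# position 2 along the flipped dimension indeed moves to 4 - 2 + 1 = 3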

if 1 ≤ index_in ≤ length(input) && 1 ≤ index_out ≤ length(output)
@inbounds output[index_out] = input[index_in]
end

return
end

nthreads = 256
nblocks = ceil(Int, length(input)/nthreads)

@cuda threads=nthreads blocks=nblocks kernel(input, output)
end

function Base.reverse(input::CuArray{T, N}; dims::Integer) where {T, N}
output = similar(input)
reverse!(input, output; dims=dims)
return output
end
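# A CPU-side sketch of the same flattened-index mapping, useful for
# sanity-checking the kernel math on small inputs (not part of this commit;
# the helper name `reverse_dims_cpu` is made up for illustration):

function reverse_dims_cpu(input::AbstractArray{T, N}, dims::Integer) where {T, N}
    shape = [size(input)...]
    numelemsinprevdims = prod(shape[1:dims-1])
    numelemsincurrdim = shape[dims]
    output = similar(input)
    for index_in in 1:length(input)
        # index along the flipped dimension, recovered from the linear index
        ik = ((cld(index_in, numelemsinprevdims) - 1) % numelemsincurrdim) + 1
        index_out = index_in + (numelemsincurrdim - 2ik + 1) * numelemsinprevdims
        output[index_out] = input[index_in]
    end
    return output
end

# agrees with Base:
#   reverse_dims_cpu(reshape(1:12, 3, 4), 2) == reverse(reshape(1:12, 3, 4), dims=2)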
3 changes: 3 additions & 0 deletions test/base.jl
@@ -325,14 +325,17 @@ end
end

@testset "reverse" begin
# 1-d out-of-place
@test testf(x->reverse(x), rand(1000))
@test testf(x->reverse(x, 10), rand(1000))
@test testf(x->reverse(x, 10, 90), rand(1000))

# 1-d in-place
@test testf(x->reverse!(x), rand(1000))
@test testf(x->reverse!(x, 10), rand(1000))
@test testf(x->reverse!(x, 10, 90), rand(1000))

# n-d out-of-place
for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
dim in 1:length(shape)
@test testf(x->reverse(x; dims=dim), rand(shape...))
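# note: `testf(f, xs...)` is the package's test helper; it presumably applies
# `f` to both the CPU arrays and their CuArray counterparts and compares the
# results, so each line above checks the GPU reverse against Base's reverse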
