Skip to content

Commit

Permalink
Merge pull request #11 from microscopic-image-analysis/formatting
Browse files Browse the repository at this point in the history
Formatting
  • Loading branch information
trahflow authored May 6, 2024
2 parents 4ccfcc3 + 000093c commit e8dd09d
Show file tree
Hide file tree
Showing 14 changed files with 849 additions and 409 deletions.
2 changes: 2 additions & 0 deletions .JuliaFormatter.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
style = "blue"
always_use_return = false
22 changes: 17 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
name = "DiffPointRasterisation"
uuid = "f984992d-3c45-4382-99a1-cf20f5c47c61"
authors = ["Wolfhart Feldmeier <wolfhart.feldmeier@uni-jena.de>"]
version = "0.2.0"
version = "0.2.1"

[deps]
ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Rotations = "6038ab10-8711-5258-84ad-4b1120ba62dc"
SimpleUnPack = "ce78b400-467f-4804-87d8-8f486da07d0a"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
Expand All @@ -23,22 +22,35 @@ DiffPointRasterisationCUDAExt = "CUDA"
DiffPointRasterisationChainRulesCoreExt = "ChainRulesCore"

[compat]
Adapt = "4"
Aqua = "0.8"
ArgCheck = "2.3"
Atomix = "0.1"
BenchmarkTools = "1"
CUDA = "5.3"
ChainRulesCore = "1.23"
ChainRulesTestUtils = "1.12"
ChunkSplitters = "2"
FillArrays = "1.9.3"
Rotations = "1.6.1"
KernelAbstractions = "0.9.18"
Rotations = "1.7"
SimpleUnPack = "1.1"
StaticArrays = "1.9.1"
Test = "1"
TestItemRunner = "0.2"
TestItems = "0.1.1"
julia = "1"
julia = "^1.9"

[extras]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
Rotations = "6038ab10-8711-5258-84ad-4b1120ba62dc"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

[targets]
test = ["Adapt", "BenchmarkTools", "ChainRulesCore", "ChainRulesTestUtils", "CUDA", "Test", "TestItemRunner"]
test = ["Adapt", "Aqua", "BenchmarkTools", "ChainRulesCore", "ChainRulesTestUtils", "CUDA", "Rotations", "Test", "TestItemRunner"]
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

[![Build Status](https://github.com/microscopic-image-analysis/DiffPointRasterisation.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/microscopic-image-analysis/DiffPointRasterisation.jl/actions/workflows/CI.yml?query=branch%3Amain)
[![](https://img.shields.io/badge/docs-main-blue.svg)](https://microscopic-image-analysis.github.io/DiffPointRasterisation.jl/dev)
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)

![](logo.gif)

Expand Down
131 changes: 83 additions & 48 deletions ext/DiffPointRasterisationCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,16 @@ using ArgCheck
using FillArrays
using StaticArrays

const CuOrFillArray{T,N} = Union{CuArray{T,N},FillArrays.AbstractFill{T,N}}

const CuOrFillArray{T, N} = Union{CuArray{T, N}, FillArrays.AbstractFill{T, N}}


const CuOrFillVector{T} = CuOrFillArray{T, 1}

const CuOrFillVector{T} = CuOrFillArray{T,1}

function raster_pullback_kernel!(
::Type{T},
ds_dout,
points::AbstractVector{<:StaticVector{N_in}},
rotations::AbstractVector{<:StaticMatrix{N_out, N_in, TR}},
translations::AbstractVector{<:StaticVector{N_out, TT}},
rotations::AbstractVector{<:StaticMatrix{N_out,N_in,TR}},
translations::AbstractVector{<:StaticVector{N_out,TT}},
out_weights,
point_weights,
shifts,
Expand All @@ -35,8 +32,7 @@ function raster_pullback_kernel!(
ds_dtranslation,
ds_dout_weight,
ds_dpoint_weight,

) where {T, TR, TT, N_in, N_out}
) where {T,TR,TT,N_in,N_out}
n_voxel = blockDim().z
points_per_workgroup = blockDim().x
batchsize_per_workgroup = blockDim().y
Expand Down Expand Up @@ -74,24 +70,27 @@ function raster_pullback_kernel!(
origin = (-@SVector ones(TT, N_out)) - translation

coord_reference_voxel, deltas = DiffPointRasterisation.reference_coordinate_and_deltas(
point,
rotation,
origin,
scale,
point, rotation, origin, scale
)
voxel_idx = CartesianIndex(
CartesianIndex(Tuple(coord_reference_voxel)) + CartesianIndex(shift), batch_idx
)
voxel_idx = CartesianIndex(CartesianIndex(Tuple(coord_reference_voxel)) + CartesianIndex(shift), batch_idx)


ds_dweight_local = zero(T)
if voxel_idx in CartesianIndices(ds_dout)
@inbounds ds_dweight_local = DiffPointRasterisation.voxel_weight(
deltas,
shift,
ds_dout[voxel_idx],
deltas, shift, ds_dout[voxel_idx]
)

factor = ds_dout[voxel_idx] * out_weight * point_weight
ds_dcoord_part = SVector(factor .* ntuple(n -> DiffPointRasterisation.interpolation_weight(n, N_out, deltas, shift), Val(N_out)))
ds_dcoord_part = SVector(
factor .* ntuple(
n -> DiffPointRasterisation.interpolation_weight(
n, N_out, deltas, shift
),
Val(N_out),
),
)
@inbounds ds_dpoint_rot_shared[:, s, b] .= ds_dcoord_part .* scale
else
@inbounds ds_dpoint_rot_shared[:, s, b] .= zero(T)
Expand Down Expand Up @@ -136,7 +135,7 @@ function raster_pullback_kernel!(
j = 1
while j <= N_in
val = coef * point[j]
@inbounds CUDA.@atomic ds_drotation[dim, j, batch_idx] += val
@inbounds CUDA.@atomic ds_drotation[dim, j, batch_idx] += val
j += 1
end
end
Expand All @@ -161,7 +160,7 @@ function raster_pullback_kernel!(
sync_threads()
idx = 2 * stride * (b - 1) + 1
if idx <= batchsize_per_workgroup
dim = s
dim = s
while dim <= N_in
other_val_p = if idx + stride <= batchsize_per_workgroup
ds_dpoint_shared[dim, idx + stride]
Expand All @@ -181,7 +180,7 @@ function raster_pullback_kernel!(
sync_threads()
idx = 2 * stride * (thread - 1) + 1
if idx <= n_threads_per_workgroup
other_val_w = if idx + stride <= n_threads_per_workgroup
other_val_w = if idx + stride <= n_threads_per_workgroup
ds_dpoint_weight_shared[idx + stride]
else
zero(T)
Expand All @@ -207,74 +206,103 @@ function raster_pullback_kernel!(
@inbounds CUDA.@atomic ds_dpoint_weight[point_idx] += val_w
end

nothing
return nothing
end

# single image
raster_pullback!(
ds_dout::CuArray{<:Number, N_out},
points::AbstractVector{<:StaticVector{N_in, <:Number}},
rotation::StaticMatrix{N_out, N_in, <:Number},
translation::StaticVector{N_out, <:Number},
function raster_pullback!(
ds_dout::CuArray{<:Number,N_out},
points::AbstractVector{<:StaticVector{N_in,<:Number}},
rotation::StaticMatrix{N_out,N_in,<:Number},
translation::StaticVector{N_out,<:Number},
background::Number,
out_weight::Number,
point_weight::CuOrFillVector{<:Number},
ds_dpoints::AbstractMatrix{<:Number},
ds_dpoint_weight::AbstractVector{<:Number};
kwargs...
) where {N_in, N_out} = error("Not implemented: raster_pullback! for single image not implemented on GPU. Consider using CPU arrays")
kwargs...,
) where {N_in,N_out}
return error(
"Not implemented: raster_pullback! for single image not implemented on GPU. Consider using CPU arrays",
)
end

# batch of images
function DiffPointRasterisation.raster_pullback!(
ds_dout::CuArray{<:Number, N_out_p1},
points::CuVector{<:StaticVector{N_in, <:Number}},
rotation::CuVector{<:StaticMatrix{N_out, N_in, <:Number}},
translation::CuVector{<:StaticVector{N_out, <:Number}},
ds_dout::CuArray{<:Number,N_out_p1},
points::CuVector{<:StaticVector{N_in,<:Number}},
rotation::CuVector{<:StaticMatrix{N_out,N_in,<:Number}},
translation::CuVector{<:StaticVector{N_out,<:Number}},
background::CuOrFillVector{<:Number},
out_weight::CuOrFillVector{<:Number},
point_weight::CuOrFillVector{<:Number},
ds_dpoints::CuMatrix{TP},
ds_drotation::CuArray{TR, 3},
ds_drotation::CuArray{TR,3},
ds_dtranslation::CuMatrix{TT},
ds_dbackground::CuVector{<:Number},
ds_dout_weight::CuVector{OW},
ds_dpoint_weight::CuVector{PW},
) where {N_in, N_out, N_out_p1, TP<:Number, TR<:Number, TT<:Number, OW<:Number, PW<:Number}
) where {N_in,N_out,N_out_p1,TP<:Number,TR<:Number,TT<:Number,OW<:Number,PW<:Number}
T = promote_type(eltype(ds_dout), TP, TR, TT, OW, PW)
batch_axis = axes(ds_dout, N_out_p1)
@argcheck N_out == N_out_p1 - 1
@argcheck batch_axis == axes(rotation, 1) == axes(translation, 1) == axes(background, 1) == axes(out_weight, 1)
@argcheck batch_axis == axes(ds_drotation, 3) == axes(ds_dtranslation, 2) == axes(ds_dbackground, 1) == axes(ds_dout_weight, 1)
@argcheck batch_axis ==
axes(rotation, 1) ==
axes(translation, 1) ==
axes(background, 1) ==
axes(out_weight, 1)
@argcheck batch_axis ==
axes(ds_drotation, 3) ==
axes(ds_dtranslation, 2) ==
axes(ds_dbackground, 1) ==
axes(ds_dout_weight, 1)
@argcheck N_out == N_out_p1 - 1

n_points = length(points)
@argcheck length(ds_dpoint_weight) == n_points
batch_size = length(batch_axis)

ds_dbackground = vec(sum!(reshape(ds_dbackground, ntuple(_ -> 1, Val(N_out))..., batch_size), ds_dout))
ds_dbackground = vec(
sum!(reshape(ds_dbackground, ntuple(_ -> 1, Val(N_out))..., batch_size), ds_dout)
)

scale = SVector{N_out, T}(size(ds_dout)[1:end-1]) / T(2)
shifts=DiffPointRasterisation.voxel_shifts(Val(N_out))
scale = SVector{N_out,T}(size(ds_dout)[1:(end - 1)]) / T(2)
shifts = DiffPointRasterisation.voxel_shifts(Val(N_out))

ds_dpoints = fill!(ds_dpoints, zero(TP))
ds_drotation = fill!(ds_drotation, zero(TR))
ds_dtranslation = fill!(ds_dtranslation, zero(TT))
ds_dout_weight = fill!(ds_dout_weight, zero(OW))
ds_dpoint_weight = fill!(ds_dpoint_weight, zero(PW))

args = (T, ds_dout, points, rotation, translation, out_weight, point_weight, shifts, scale, ds_dpoints, ds_drotation, ds_dtranslation, ds_dout_weight, ds_dpoint_weight)
args = (
T,
ds_dout,
points,
rotation,
translation,
out_weight,
point_weight,
shifts,
scale,
ds_dpoints,
ds_drotation,
ds_dtranslation,
ds_dout_weight,
ds_dpoint_weight,
)

ndrange = (n_points, batch_size, 2^N_out)

workgroup_size(threads) = (1, min(threads ÷ (2^N_out), batch_size), 2^N_out)

function shmem(threads)
_, bs_p_wg, n_voxel = workgroup_size(threads)
((N_out + 1) * n_voxel + N_in) * bs_p_wg * sizeof(T)
_, bs_p_wg, n_voxel = workgroup_size(threads)
return ((N_out + 1) * n_voxel + N_in) * bs_p_wg * sizeof(T)
# ((N_out + 1) * threads + N_in * bs_p_wg) * sizeof(T)
end

let kernel = @cuda launch=false raster_pullback_kernel!(args...)
let kernel = @cuda launch = false raster_pullback_kernel!(args...)
config = CUDA.launch_configuration(kernel.fun; shmem)
workgroup_sz = workgroup_size(config.threads)
blocks = cld.(ndrange, workgroup_sz)
Expand All @@ -292,9 +320,16 @@ function DiffPointRasterisation.raster_pullback!(
)
end

function DiffPointRasterisation.default_ds_dpoints_batched(
points::CuVector{<:AbstractVector{TP}}, N_in, batch_size
) where {TP<:Number}
return similar(points, TP, (N_in, length(points)))
end

DiffPointRasterisation.default_ds_dpoints_batched(points::CuVector{<:AbstractVector{TP}}, N_in, batch_size) where {TP<:Number} = similar(points, TP, (N_in, length(points)))

DiffPointRasterisation.default_ds_dpoint_weight_batched(points::CuVector{<:AbstractVector{<:Number}}, T, batch_size) = similar(points, T)
function DiffPointRasterisation.default_ds_dpoint_weight_batched(
points::CuVector{<:AbstractVector{<:Number}}, T, batch_size
)
return similar(points, T)
end

end # module
Loading

0 comments on commit e8dd09d

Please sign in to comment.