diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl index dafc620..be7133a 100644 --- a/ext/JACCAMDGPU/JACCAMDGPU.jl +++ b/ext/JACCAMDGPU/JACCAMDGPU.jl @@ -1,15 +1,13 @@ module JACCAMDGPU using JACC, AMDGPU - -# overloaded array functions -include("array.jl") +using JACC: JACCArrayType # overloaded experimental functions include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:ROCArray}, N::Integer, f::Function, x...) numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -17,8 +15,8 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} AMDGPU.synchronize() end -function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:ROCArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -29,8 +27,8 @@ function JACC.parallel_for( AMDGPU.synchronize() end -function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:ROCArray}, + N::Integer, f::Function, x...) numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -45,8 +43,8 @@ function JACC.parallel_reduce( return rret end -function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:ROCArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -313,8 +311,9 @@ function reduce_kernel_amdgpu_MN((M, N), red, ret) return nothing end +JACC.arraytype(::Val{:amdgpu}) = ROCArray + function __init__() - const JACC.Array = AMDGPU.ROCArray{T, N} where {T, N} end end # module JACCAMDGPU diff --git a/ext/JACCAMDGPU/JACCEXPERIMENTAL.jl b/ext/JACCAMDGPU/JACCEXPERIMENTAL.jl index 5325bbd..328a545 100644 --- a/ext/JACCAMDGPU/JACCEXPERIMENTAL.jl +++ b/ext/JACCAMDGPU/JACCEXPERIMENTAL.jl @@ -2,7 +2,8 @@ module experimental using JACC, AMDGPU -function JACC.experimental.shared(x::ROCDeviceArray{T,N}) where {T,N} +function JACC.experimental.shared(x::ROCDeviceArray) + T = eltype(x) size = length(x) shmem = @ROCDynamicLocalArray(T, size) num_threads = workgroupDim().x * workgroupDim().y diff --git a/ext/JACCAMDGPU/array.jl b/ext/JACCAMDGPU/array.jl deleted file mode 100644 index b826080..0000000 --- a/ext/JACCAMDGPU/array.jl +++ /dev/null @@ -1,8 +0,0 @@ - -function JACC.zeros(T, dims...) - return AMDGPU.zeros(T, dims...) -end - -function JACC.ones(T, dims...) - return AMDGPU.ones(T, dims...) -end diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl index 21b5dc9..e6b1ad3 100644 --- a/ext/JACCCUDA/JACCCUDA.jl +++ b/ext/JACCCUDA/JACCCUDA.jl @@ -1,15 +1,12 @@ module JACCCUDA using JACC, CUDA +using JACC: JACCArrayType -# overloaded array functions -include("array.jl") - -# overloaded experimental functions include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:CuArray}, N::Integer, f::Function, x...) parallel_args = (N, f, x...) parallel_kargs = cudaconvert.(parallel_args) parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} @@ -20,8 +17,8 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} parallel_kernel(parallel_kargs...; threads = threads, blocks = blocks) end -function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:CuArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) #To use JACC.shared, it is recommended to use a high number of threads per block to maximize the # potential benefit from using shared memory. #numThreads = 32 @@ -37,8 +34,8 @@ function JACC.parallel_for( # f, x...) end -function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:CuArray}, + N::Integer, f::Function, x...) numThreads = 512 threads = min(N, numThreads) blocks = ceil(Int, N / threads) @@ -51,8 +48,8 @@ function JACC.parallel_reduce( return rret end -function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:CuArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) numThreads = 16 Mthreads = min(M, numThreads) Nthreads = min(N, numThreads) @@ -322,8 +319,9 @@ function reduce_kernel_cuda_MN((M, N), red, ret) return nothing end +JACC.arraytype(::Val{:cuda}) = CuArray + function __init__() - const JACC.Array = CUDA.CuArray{T, N} where {T, N} end end # module JACCCUDA diff --git a/ext/JACCCUDA/JACCEXPERIMENTAL.jl b/ext/JACCCUDA/JACCEXPERIMENTAL.jl index 490afb3..9862074 100644 --- a/ext/JACCCUDA/JACCEXPERIMENTAL.jl +++ b/ext/JACCCUDA/JACCEXPERIMENTAL.jl @@ -2,7 +2,8 @@ module experimental using JACC, CUDA -function JACC.experimental.shared(x::CuDeviceArray{T,N}) where {T,N} +function JACC.experimental.shared(x::CuDeviceArray) + T = eltype(x) size = length(x) shmem = @cuDynamicSharedMem(T, size) num_threads = blockDim().x * blockDim().y diff --git a/ext/JACCCUDA/array.jl b/ext/JACCCUDA/array.jl deleted file mode 100644 index 5cf21e0..0000000 --- a/ext/JACCCUDA/array.jl +++ /dev/null @@ -1,8 +0,0 @@ - -function JACC.zeros(T, dims...) - return CUDA.zeros(T, dims...) -end - -function JACC.ones(T, dims...) - return CUDA.ones(T, dims...) -end diff --git a/ext/JACCONEAPI/JACCEXPERIMENTAL.jl b/ext/JACCONEAPI/JACCEXPERIMENTAL.jl index 8225540..12cf337 100644 --- a/ext/JACCONEAPI/JACCEXPERIMENTAL.jl +++ b/ext/JACCONEAPI/JACCEXPERIMENTAL.jl @@ -2,7 +2,8 @@ module experimental using JACC, oneAPI -function JACC.experimental.shared(x::oneDeviceArray{T,N}) where {T,N} +function JACC.experimental.shared(x::oneDeviceArray) + T = eltype(x) size = length(x) shmem = oneLocalArray(T, size) num_threads = get_local_size(0) * get_local_size(1) diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl index f138383..d897e54 100644 --- a/ext/JACCONEAPI/JACCONEAPI.jl +++ b/ext/JACCONEAPI/JACCONEAPI.jl @@ -1,15 +1,13 @@ module JACCONEAPI using JACC, oneAPI - -# overloaded array functions -include("array.jl") +using JACC: JACCArrayType # overloaded experimental functions include("JACCEXPERIMENTAL.jl") using .experimental -function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:oneArray}, N::Integer, f::Function, x...) #maxPossibleItems = oneAPI.oneL0.compute_properties(device().maxTotalGroupSize) maxPossibleItems = 256 items = min(N, maxPossibleItems) @@ -17,8 +15,8 @@ function JACC.parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} oneAPI.@sync @oneapi items=items groups=groups _parallel_for_oneapi(f, x...) end -function JACC.parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_for(::JACCArrayType{<:oneArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) maxPossibleItems = 16 Mitems = min(M, maxPossibleItems) Nitems = min(N, maxPossibleItems) @@ -28,8 +26,8 @@ function JACC.parallel_for( f, x...) end -function JACC.parallel_reduce( - N::I, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:oneArray}, + N::Integer, f::Function, x...) numItems = 256 items = min(N, numItems) groups = ceil(Int, N / items) @@ -41,8 +39,8 @@ function JACC.parallel_reduce( return rret end -function JACC.parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} +function JACC.parallel_reduce(::JACCArrayType{<:oneArray}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) numItems = 16 Mitems = min(M, numItems) Nitems = min(N, numItems) @@ -306,8 +304,9 @@ function reduce_kernel_oneapi_MN((M, N), red, ret) return nothing end +JACC.arraytype(::Val{:oneapi}) = oneArray + function __init__() - const JACC.Array = oneAPI.oneArray{T, N} where {T, N} end end # module JACCONEAPI diff --git a/src/JACC.jl b/src/JACC.jl index bb7b474..9e3d8ac 100644 --- a/src/JACC.jl +++ b/src/JACC.jl @@ -1,68 +1,32 @@ __precompile__(false) module JACC -import Atomix: @atomic +using Atomix: @atomic # module to set back end preferences +include("JACCArrayType.jl") include("JACCPreferences.jl") include("helper.jl") # overloaded array functions include("array.jl") + include("JACCBLAS.jl") using .BLAS include("JACCEXPERIMENTAL.jl") using .experimental -export Array, @atomic export parallel_for -global Array - -function parallel_for(N::I, f::F, x...) where {I <: Integer, F <: Function} - @maybe_threaded for i in 1:N - f(i, x...) - end -end - -function parallel_for( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - @maybe_threaded for j in 1:N - for i in 1:M - f(i, j, x...) - end - end -end - -function parallel_reduce(N::I, f::F, x...) where {I <: Integer, F <: Function} - tmp = zeros(Threads.nthreads()) - ret = zeros(1) - @maybe_threaded for i in 1:N - tmp[Threads.threadid()] = tmp[Threads.threadid()] .+ f(i, x...) - end - for i in 1:Threads.nthreads() - ret = ret .+ tmp[i] - end - return ret +function parallel_for(N, f::Function, x...) + return parallel_for(JACCPreferences.JACC_BACKEND_TYPE(), N, f, x...) end -function parallel_reduce( - (M, N)::Tuple{I, I}, f::F, x...) where {I <: Integer, F <: Function} - tmp = zeros(Threads.nthreads()) - ret = zeros(1) - @maybe_threaded for j in 1:N - for i in 1:M - tmp[Threads.threadid()] = tmp[Threads.threadid()] .+ f(i, j, x...) - end - end - for i in 1:Threads.nthreads() - ret = ret .+ tmp[i] - end - return ret +function parallel_reduce(N, f::Function, x...) + return parallel_reduce(JACCPreferences.JACC_BACKEND_TYPE(), N, f, x...) end function __init__() - const JACC.Array = Base.Array{T, N} where {T, N} end end # module JACC diff --git a/src/JACCArrayType.jl b/src/JACCArrayType.jl new file mode 100644 index 0000000..637e147 --- /dev/null +++ b/src/JACCArrayType.jl @@ -0,0 +1,8 @@ +struct JACCArrayType{T} +end + +arraytype() = arraytype(Val(Symbol(JACCPreferences.backend))) +arraytype(::Val{:threads}) = Array +arraytype(::Val{T}) where T = error("The backend $(T) is either not recognized or the associated package is not loaded.") +arraytype(J::JACCArrayType) = arraytype(typeof(J)) +arraytype(::Type{<:JACCArrayType{T}}) where {T} = T \ No newline at end of file diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl index 387b2ad..df9676c 100644 --- a/src/JACCBLAS.jl +++ b/src/JACCBLAS.jl @@ -10,11 +10,11 @@ function _dot(i, x, y) return @inbounds x[i] * y[i] end -function axpy(n::I, alpha, x, y) where {I<:Integer} +function axpy(n::Integer, alpha, x, y) JACC.parallel_for(n, _axpy, alpha, x, y) end -function dot(n::I, x, y) where {I<:Integer} +function dot(n::Integer, x, y) JACC.parallel_reduce(n, _dot, x, y) end diff --git a/src/JACCEXPERIMENTAL.jl b/src/JACCEXPERIMENTAL.jl index ea2f35c..7736ad3 100644 --- a/src/JACCEXPERIMENTAL.jl +++ b/src/JACCEXPERIMENTAL.jl @@ -2,7 +2,7 @@ module experimental using JACC -function shared(x::Base.Array{T,N}) where {T,N} +function shared(x) return x end diff --git a/src/JACCPreferences.jl b/src/JACCPreferences.jl index f66ddce..5429946 100644 --- a/src/JACCPreferences.jl +++ b/src/JACCPreferences.jl @@ -17,4 +17,10 @@ end const backend = @load_preference("backend", "threads") +using JACC: JACCArrayType, arraytype + +function JACC_BACKEND_TYPE() + return JACCArrayType{arraytype(Val(Symbol(JACCPreferences.backend)))}() +end + end # module JACCPreferences diff --git a/src/array.jl b/src/array.jl index 1c1e703..5bedf9a 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,8 +1,52 @@ function zeros(T, dims...) - return Base.zeros(T, dims...) + return fill!(similar(arraytype(){T}, dims...), zero(T)) end function ones(T, dims...) - return Base.ones(T, dims...) + return fill!(similar(arraytype(){T}, dims...), one(T)) +end + +array(T::AbstractArray) = arraytype()(T) + +function parallel_for(::JACCArrayType{<:Array}, N::Integer, f::Function, x...) + @maybe_threaded for i in 1:N + f(i, x...) + end +end + +function parallel_for(::JACCArrayType{<:Array}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) + @maybe_threaded for j in 1:N + for i in 1:M + f(i, j, x...) + end + end +end + +function parallel_reduce(::JACCArrayType{<:Array}, N::Integer, f::Function, x...) + tmp = Base.zeros(Threads.nthreads()) + ret = Base.zeros(1) + @maybe_threaded for i in 1:N + tmp[Threads.threadid()] = tmp[Threads.threadid()] .+ f(i, x...) + end + for i in 1:Threads.nthreads() + ret = ret .+ tmp[i] + end + return ret +end + +function parallel_reduce(::JACCArrayType{<:Array}, + (M, N)::Tuple{Integer, Integer}, f::Function, x...) + tmp = Base.zeros(Threads.nthreads()) + ret = Base.zeros(1) + @maybe_threaded for j in 1:N + for i in 1:M + tmp[Threads.threadid()] = tmp[Threads.threadid()] .+ f(i, j, x...) + end + end + for i in 1:Threads.nthreads() + ret = ret .+ tmp[i] + end + return ret end diff --git a/test/runtests.jl b/test/runtests.jl index 93566ec..e6017bd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,23 +3,32 @@ import JACC using Pkg const backend = JACC.JACCPreferences.backend +using Test: @test @static if backend == "cuda" Pkg.add(; name = "CUDA", version = "v5.1.1") @show "CUDA backend loaded" + using CUDA: CuArray + @test JACC.arraytype() <: CuArray include("tests_cuda.jl") elseif backend == "amdgpu" - Pkg.add(; name = "AMDGPU", version = "v0.8.6") + Pkg.add(; name = "AMDGPU") @show "AMDGPU backend loaded" + using AMDGPU: ROCArray + @test JACC.arraytype() <: ROCArray include("tests_amdgpu.jl") elseif backend == "oneapi" Pkg.add("oneAPI") @show "OneAPI backend loaded" + using OneAPI: oneArray + @test JACC.arraytype() <: oneArray include("tests_oneapi.jl") elseif backend == "threads" @show "Threads backend loaded" - include("tests_threads.jl") + @test JACC.arraytype() <: Array end + +include("tests_threads.jl") diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl index f5bd322..cc23601 100644 --- a/test/tests_amdgpu.jl +++ b/test/tests_amdgpu.jl @@ -1,5 +1,5 @@ -import AMDGPU -import JACC +using AMDGPU +using JACC using Test @testset "TestBackend" begin @@ -15,7 +15,7 @@ end dims = (N) a = round.(rand(Float32, dims) * 100) - a_device = JACC.Array(a) + a_device = JACC.array(a) JACC.parallel_for(N, f, a_device) a_expected = a .+ 5.0 @@ -39,8 +39,8 @@ end y = round.(rand(Float32, N) * 100) alpha = 2.5 - x_device = JACC.Array(x) - y_device = JACC.Array(y) + x_device = JACC.array(x) + y_device = JACC.array(y) JACC.parallel_for(N, axpy, alpha, x_device, y_device) x_expected = x @@ -59,9 +59,9 @@ end # Generate random vectors x and y of length N for the interval [0, 100] alpha = 2.5 - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) + x = JACC.array(round.(rand(Float32, N) * 100)) + y = JACC.array(round.(rand(Float32, N) * 100)) + counter = JACC.array(Int32[0]) JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) @test Array(counter)[1] == N @@ -97,7 +97,7 @@ end @test zeros(N)≈Array(x) rtol=1e-5 end -#@testset "JACC.BLAS" begin +# @testset "JACC.BLAS" begin # function seq_axpy(N, alpha, x, y) # for i in 1:N @@ -113,10 +113,11 @@ end # return r # end -# x = ones(1_000) -# y = ones(1_000) -# jx = JACC.ones(1_000) -# jy = JACC.ones(1_000) +# elt = Float64 +# x = ones(elt, 1_000) +# y = ones(elt, 1_000) +# jx = JACC.ones(elt, 1_000) +# jy = JACC.ones(elt, 1_000) # alpha = 2.0 # seq_axpy(1_000, alpha, x, y) @@ -128,4 +129,4 @@ end # @test result[1]≈ref_result rtol=1e-8 -#end +# end diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl index a1686be..97c4035 100644 --- a/test/tests_cuda.jl +++ b/test/tests_cuda.jl @@ -1,5 +1,5 @@ using CUDA -import JACC +using JACC using Test @testset "TestBackend" begin @@ -15,7 +15,7 @@ end dims = (N) a = round.(rand(Float32, dims) * 100) - a_device = JACC.Array(a) + a_device = JACC.array(a) JACC.parallel_for(N, f, a_device) a_expected = a .+ 5.0 @@ -39,8 +39,8 @@ end y = round.(rand(Float32, N) * 100) alpha = 2.5 - x_device = JACC.Array(x) - y_device = JACC.Array(y) + x_device = JACC.array(x) + y_device = JACC.array(y) JACC.parallel_for(N, axpy, alpha, x_device, y_device) x_expected = x @@ -59,9 +59,9 @@ end # Generate random vectors x and y of length N for the interval [0, 100] alpha = 2.5 - x = JACC.Array(round.(rand(Float32, N) * 100)) - y = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) + x = JACC.array(round.(rand(Float32, N) * 100)) + y = JACC.array(round.(rand(Float32, N) * 100)) + counter = JACC.array(Int32[0]) JACC.parallel_for(N, axpy_counter!, alpha, x, y, counter) @test Array(counter)[1] == N @@ -150,10 +150,11 @@ end return r end - x = ones(1_000) - y = ones(1_000) - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) + elt = Float64 + x = ones(elt, 1_000) + y = ones(elt, 1_000) + jx = JACC.ones(elt, 1_000) + jy = JACC.ones(elt, 1_000) alpha = 2.0 seq_axpy(1_000, alpha, x, y) diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl index bc4af4f..075c385 100644 --- a/test/tests_oneapi.jl +++ b/test/tests_oneapi.jl @@ -15,7 +15,7 @@ end dims = (N) a = round.(rand(Float32, dims) * 100) - a_device = JACC.Array(a) + a_device = JACC.array(a) JACC.parallel_for(N, f, a_device) a_expected = a .+ 5.0 @@ -39,8 +39,8 @@ end y = round.(rand(Float32, N) * 100) alpha::Float32 = 2.5 - x_device = JACC.Array(x) - y_device = JACC.Array(y) + x_device = JACC.array(x) + y_device = JACC.array(y) JACC.parallel_for(N, axpy, alpha, x_device, y_device) x_expected = x diff --git a/test/tests_threads.jl b/test/tests_threads.jl index 80953a0..0d2ee29 100644 --- a/test/tests_threads.jl +++ b/test/tests_threads.jl @@ -1,10 +1,6 @@ import JACC using Test -@testset "TestBackend" begin - @test JACC.JACCPreferences.backend == "threads" -end - @testset "VectorAddLambda" begin function f(x, a) @inbounds a[x] += 5.0 @@ -14,9 +10,10 @@ end a = round.(rand(Float32, dims) * 100) a_expected = a .+ 5.0 + a = JACC.array(a) JACC.parallel_for(10, f, a) - @test a≈a_expected rtol=1e-5 + @test Array(a)≈a_expected rtol=1e-5 end @testset "AXPY" begin @@ -38,14 +35,14 @@ end y = round.(rand(Float32, N) * 100) alpha = 2.5 - x_host_JACC = JACC.Array(x) - y_host_JACC = JACC.Array(y) + x_host_JACC = JACC.array(x) + y_host_JACC = JACC.array(y) JACC.parallel_for(N, axpy, alpha, x_host_JACC, y_host_JACC) x_expected = x seq_axpy(N, alpha, x_expected, y) - @test x_host_JACC≈x_expected rtol=1e-1 + @test Array(x_host_JACC)≈x_expected rtol=1e-1 end @testset "AtomicCounter" begin @@ -59,45 +56,48 @@ end alpha = 2.5 counter = zeros(Int32, 1) - x_device = JACC.Array(round.(rand(Float32, N) * 100)) - y_device = JACC.Array(round.(rand(Float32, N) * 100)) - counter = JACC.Array{Int32}([0]) + x_device = JACC.array(round.(rand(Float32, N) * 100)) + y_device = JACC.array(round.(rand(Float32, N) * 100)) + counter = JACC.array(Int32[0]) JACC.parallel_for(N, axpy_counter!, alpha, x_device, y_device, counter) - @test counter[1] == N + @test Array(counter)[1] == N end @testset "zeros" begin + elt = Float32 N = 10 - x = JACC.zeros(Float32, N) - @test typeof(x) == Vector{Float32} - @test eltype(x) == Float32 - @test zeros(N)≈x rtol=1e-5 + x = JACC.zeros(elt, N) + @test typeof(x) <: JACC.arraytype(){elt,1} + @test eltype(x) == elt + @test JACC.arraytype()(zeros(N))≈x rtol=1e-5 function add_one(i, x) @inbounds x[i] += 1 end JACC.parallel_for(N, add_one, x) - @test ones(N)≈x rtol=1e-5 + @test JACC.arraytype()(ones(N)) ≈ x rtol=1e-5 end @testset "ones" begin + elt = Float64 N = 10 - x = JACC.ones(Float64, N) - @test typeof(x) == Vector{Float64} - @test eltype(x) == Float64 - @test ones(N)≈x rtol=1e-5 + x = JACC.ones(elt, N) + @test typeof(x) <: JACC.arraytype(){elt,1} + @test eltype(x) == elt + @test JACC.arraytype()(ones(N)) ≈ x rtol=1e-5 function minus_one(i, x) @inbounds x[i] -= 1 end JACC.parallel_for(N, minus_one, x) - @test zeros(N)≈x rtol=1e-5 + @test JACC.arraytype()(zeros(N)) ≈ x rtol=1e-5 end @testset "CG" begin + elt = Float64 function matvecmul(i, a1, a2, a3, x, y, SIZE) if i == 1 y[i] = a2[i] * x[i] + a1[i] * x[i + 1] @@ -117,21 +117,16 @@ end end SIZE = 10 - a0 = ones(SIZE) - a1 = ones(SIZE) - a2 = ones(SIZE) - r = ones(SIZE) - p = ones(SIZE) - s = zeros(SIZE) - x = zeros(SIZE) - r_old = zeros(SIZE) - r_aux = zeros(SIZE) + a0, a1, a2, r, p = JACC.ones.(elt, (SIZE, SIZE, SIZE, SIZE, SIZE)) + + s, x, r_old, r_aux = JACC.zeros.(elt, (SIZE, SIZE, SIZE, SIZE)) + a1 = a1 * 4 r = r * 0.5 p = p * 0.5 - global cond = one(Float64) + global cond = ones(Float64,1) - while cond[1, 1] >= 1e-14 + while Array(cond)[1] >= 1e-14 r_old = copy(r) JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE) @@ -156,10 +151,11 @@ end global cond = ccond p = copy(r_aux) end - @test cond[1, 1] <= 1e-14 + @test Array(cond)[1] <= 1e-14 end @testset "LBM" begin + elt = Float64 function lbm_kernel(x, y, f, f1, f2, t, w, cx, cy, SIZE) u = 0.0 v = 0.0 @@ -237,53 +233,42 @@ end end SIZE = 10 - f = ones(SIZE * SIZE * 9) .* 2.0 - f1 = ones(SIZE * SIZE * 9) .* 3.0 - f2 = ones(SIZE * SIZE * 9) .* 4.0 - cx = zeros(9) - cy = zeros(9) - cx[1] = 0 - cy[1] = 0 - cx[2] = 1 - cy[2] = 0 - cx[3] = -1 - cy[3] = 0 - cx[4] = 0 - cy[4] = 1 - cx[5] = 0 - cy[5] = -1 - cx[6] = 1 - cy[6] = 1 - cx[7] = -1 - cy[7] = 1 - cx[8] = -1 - cy[8] = -1 - cx[9] = 1 - cy[9] = -1 + SIZE2 = SIZE * SIZE * 9 + f = ones(elt, SIZE2) .* 2.0 + f1 = ones(elt, SIZE2 ) .* 3.0 + f2 = ones(elt, SIZE2) .* 4.0 + cx = elt[0.0, 1.0, -1.0, 0.0, 0.0, 1.0, -1.0, -1.0, 1.0] + cy = elt[0.0, 0.0, 0.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0] + w = ones(9) t = 1.0 - df = JACC.Array(f) - df1 = JACC.Array(f1) - df2 = JACC.Array(f2) - dcx = JACC.Array(cx) - dcy = JACC.Array(cy) - dw = JACC.Array(w) + df = JACC.array(f) + df1 = JACC.array(f1) + df2 = JACC.array(f2) + dcx = JACC.array(cx) + dcy = JACC.array(cy) + dw = JACC.array(w) JACC.parallel_for( (SIZE, SIZE), lbm_kernel, df, df1, df2, t, dw, dcx, dcy, SIZE) lbm_threads(f, f1, f2, t, w, cx, cy, SIZE) - @test f2≈df2 rtol=1e-1 + @test JACC.arraytype()(f2) ≈ df2 rtol=1e-1 end @testset "JACC.BLAS" begin - + if JACC.JACCPreferences.backend == "amdgpu" + ## There is an error in AMDGPU when blocks ≠ 1 which causes JACC.BLAS to fail + return + end + elt = Float64 + x = ones(1_000) y = ones(1_000) - jx = JACC.ones(1_000) - jy = JACC.ones(1_000) + jx = JACC.ones(elt, 1_000) + jy = JACC.ones(elt, 1_000) alpha = 2.0 function seq_axpy(N, alpha, x, y) @@ -305,7 +290,7 @@ end JACC.BLAS.axpy(1_000, alpha, jx, jy) jresult = JACC.BLAS.dot(1_000, jx, jy) - result = jresult[1] + result = Array(jresult)[1] @test result≈ref_result rtol=1e-8