From 654fdd8caebed72f116568f8116ace0023e6a364 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Fri, 28 Jul 2017 16:41:38 +0200
Subject: [PATCH] Introduce DevicePtr to track address space of pointers.

Optimize loads and stores using LLVM.jl.
---
 REQUIRE                                |   4 +-
 src/CUDAnative.jl                      |  15 ++-
 src/{device/util.jl => cgutils.jl}     |  31 +++++-
 src/device/array.jl                    |  81 ++++++++------
 src/device/intrinsics/memory_shared.jl |  29 +++--
 src/execution.jl                       |  38 +------
 src/pointer.jl                         | 148 +++++++++++++++++++++++++
 test/array.jl                          |  57 +++++++---
 test/codegen.jl                        |  19 ++++
 test/execution.jl                      |  22 ++--
 test/pointer.jl                        |  59 ++++++++++
 test/runtests.jl                       |   1 +
 12 files changed, 398 insertions(+), 106 deletions(-)
 rename src/{device/util.jl => cgutils.jl} (82%)
 create mode 100644 src/pointer.jl
 create mode 100644 test/pointer.jl

diff --git a/REQUIRE b/REQUIRE
index cb8420de..cb9f283f 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,3 +1,3 @@
 julia 0.6
-CUDAdrv 0.4.2
-LLVM 0.3.6
+CUDAdrv 0.5.0
+LLVM 0.3.8
diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl
index 2c0bfe17..21d7d4be 100644
--- a/src/CUDAnative.jl
+++ b/src/CUDAnative.jl
@@ -16,17 +16,22 @@ else
     false
 end
 
+include("cgutils.jl")
+include("pointer.jl")
+
+# needs to be loaded _before_ the compiler infrastructure, because of generated functions
+include(joinpath("device", "array.jl"))
+include(joinpath("device", "intrinsics.jl"))
+include(joinpath("device", "libdevice.jl"))
+
 include("jit.jl")
 include("profile.jl")
-include(joinpath("device", "util.jl"))
-include(joinpath("device", "array.jl"))
-include(joinpath("device", "intrinsics.jl")) # some of these files contain generated functions,
-include(joinpath("device", "libdevice.jl")) # so should get loaded late (JuliaLang/julia#19942)
 include("execution.jl")
 include("reflection.jl")
 
 const default_device = Ref{CuDevice}()
 const default_context = Ref{CuContext}()
+const jlctx = Ref{LLVM.Context}()
 
 function __init__()
     if !configured
         warn("CUDAnative.jl has not been configured, and will not work properly.")
@@ -49,6 +54,8 @@ function __init__()
     default_device[] = CuDevice(0)
     default_context[] = CuContext(default_device[])
 
+    jlctx[] = LLVM.Context(cglobal(:jl_LLVMContext, Void))
+
     init_jit()
 end
diff --git a/src/device/util.jl b/src/cgutils.jl
similarity index 82%
rename from src/device/util.jl
rename to src/cgutils.jl
index 55920194..f4a8f33d 100644
--- a/src/device/util.jl
+++ b/src/cgutils.jl
@@ -1,4 +1,4 @@
-# Utility functions for implementing intrinsics and other device code
+# Code generation utility functions
 
 # how to map primitive Julia types to LLVM data types
 const llvmtypes = Dict{Type,Symbol}(
@@ -146,3 +146,32 @@ Base.@pure function datatype_align(::Type{T}) where {T}
     field = T.layout + sizeof(UInt32)
     unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1)
 end
+
+
+# create an LLVM function, given its return (LLVM) type and a vector of argument types
+function create_llvmf(ret::LLVMType, params::Vector{LLVMType}, name::String="")::LLVM.Function
+    mod = LLVM.Module("llvmcall", jlctx[])
+
+    llvmf_typ = LLVM.FunctionType(ret, params)
+    llvmf = LLVM.Function(mod, name, llvmf_typ)
+    push!(function_attributes(llvmf), EnumAttribute("alwaysinline"))
+
+    return llvmf
+end
+
+# call an LLVM function, given its return (Julia) type, a tuple-type for the arguments,
+# and an expression yielding a tuple of the actual argument values.
+function call_llvmf(llvmf::LLVM.Function, ret::Type, params::Type, args::Expr)
+    quote
+        Base.@_inline_meta
+        Base.llvmcall(LLVM.ref($llvmf), $ret, $params, $args...)
+    end
+end
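+
+# Example (illustrative sketch, not part of the API): inside a generated function,
+# where the element type `T` is known, the two helpers above combine as follows:
+#
+#   eltyp = convert(LLVMType, T)
+#   llvmf = create_llvmf(eltyp, [eltyp])        # a T (T) function
+#   Builder(jlctx[]) do builder
+#       entry = BasicBlock(llvmf, "entry", jlctx[])
+#       position!(builder, entry)
+#       ret!(builder, parameters(llvmf)[1])     # identity: return the argument
+#   end
+#   call_llvmf(llvmf, T, Tuple{T}, :((x,)))     # expression to splice into the caller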
+
+function Base.convert(::Type{LLVMType}, typ::Type)
+    isboxed_ref = Ref{Bool}()
+    llvmtyp = LLVMType(ccall(:julia_type_to_llvm, LLVM.API.LLVMTypeRef,
+                             (Any, Ptr{Bool}), typ, isboxed_ref))
+    @assert !isboxed_ref[]
+    return llvmtyp
+end
diff --git a/src/device/array.jl b/src/device/array.jl
index eace069d..33b29082 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -9,7 +9,8 @@ export
 """
     CuDeviceArray(dims, ptr)
     CuDeviceArray{T}(dims, ptr)
     CuDeviceArray{T,N}(dims, ptr)
+    CuDeviceArray{T,N,A}(dims, ptr)
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
 pointer, where `N` is determined from the length of `dims` and `T` is determined from the
@@ -23,62 +24,69 @@ CuDeviceArray
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 #       because we're currently requiring a trailing pointer argument.
 
-struct CuDeviceArray{T,N} <: AbstractArray{T,N}
+struct CuDeviceArray{T,N,A} <: AbstractArray{T,N}
     shape::NTuple{N,Int}
-    ptr::Ptr{T}
+    ptr::DevicePtr{T,A}
 
-    # inner constructors (exact types, ie. Int not <:Integer)
-    CuDeviceArray{T,N}(shape::NTuple{N,Int}, ptr::Ptr{T}) where {T,N} = new(shape, ptr)
+    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
+    CuDeviceArray{T,N,A}(shape::NTuple{N,Int}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape, ptr)
 end
 
-const CuDeviceVector = CuDeviceArray{T,1} where {T}
-const CuDeviceMatrix = CuDeviceArray{T,2} where {T}
+const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
+const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
 
 # outer constructors, non-parameterized
-CuDeviceArray(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
-CuDeviceArray(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
+CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
 
 # outer constructors, partially parameterized
-(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
-(::Type{CuDeviceArray{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
+(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+(::Type{CuDeviceArray{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
+(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+(::Type{CuDeviceVector{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
 
 # outer constructors, fully parameterized
-(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(Int.(dims), p)
-(::Type{CuDeviceVector{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((Int(len),), p)
+(::Type{CuDeviceArray{T,N,A}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p)
+(::Type{CuDeviceVector{T,A}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p)
 
-Base.convert(::Type{CuDeviceArray{T,N}}, a::CuArray{T,N}) where {T,N} =
-    CuDeviceArray{T,N}(a.shape, Base.unsafe_convert(Ptr{T}, a.devptr))
 
-Base.unsafe_convert(::Type{Ptr{T}}, a::CuDeviceArray{T}) where {T} = a.ptr::Ptr{T}
+## getters
 
-## array interface
+Base.pointer(a::CuDeviceArray) = a.ptr
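+
+# Example (hypothetical pointer value, for illustration only):
+#
+#   ptr = DevicePtr{Float32,AS.Global}(Ptr{Float32}(0x1000))
+#   a = CuDeviceArray((2, 3), ptr)   # inferred as CuDeviceArray{Float32,2,AS.Global}
+#   pointer(a) == ptr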
 
 Base.size(g::CuDeviceArray) = g.shape
 Base.length(g::CuDeviceArray) = prod(g.shape)
 
-@inline function Base.getindex(A::CuDeviceArray{T}, index::Int) where {T}
+
+## conversions
+
+Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)
+
+# from CuArray
+function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N}
+    owned_ptr = pointer(a)
+    ptr = Base.unsafe_convert(Ptr{T}, owned_ptr)
+    CuDeviceArray{T,N,AS.Global}(a.shape, DevicePtr{T,AS.Global}(ptr))
+end
+cudaconvert(::Type{CuArray{T,N}}) where {T,N} = CuDeviceArray{T,N,AS.Global}
+
+
+## indexing
+
+@inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T}
     @boundscheck checkbounds(A, index)
     align = datatype_align(T)
-    Base.pointerref(Base.unsafe_convert(Ptr{T}, A), index, align)::T
+    Base.unsafe_load(pointer(A), index, Val{align})::T
 end
 
-@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Int) where {T}
+@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T}
     @boundscheck checkbounds(A, index)
     align = datatype_align(T)
-    Base.pointerset(Base.unsafe_convert(Ptr{T}, A), convert(T, x)::T, index, align)
+    Base.unsafe_store!(pointer(A), x, index, Val{align})
 end
 
 Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear()
 
-Base.show(io::IO, a::CuDeviceVector{T}) where {T} =
-    print(io, "$(length(a))-element device array at $(pointer(a))")
-Base.show(io::IO, a::CuDeviceArray{T,N}) where {T,N} =
-    print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")
-
-
-## quirks
-
 # bounds checking is currently broken due to a PTX assembler issue (see #4)
 Base.checkbounds(::CuDeviceArray, I...) = nothing
 
@@ -88,10 +96,19 @@ struct CuBoundsError <: Exception end
 @inline Base.throw_boundserror(A::CuDeviceArray, I) =
     (Base.@_noinline_meta; throw(CuBoundsError()))
 
-# idem
+
+## other
+
+Base.show(io::IO, a::CuDeviceVector) =
+    print(io, "$(length(a))-element device array at $(pointer(a))")
+Base.show(io::IO, a::CuDeviceArray) =
+    print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")
+
+Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
+
 function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T}
     Base.@_inline_meta
-    ptr = Base.unsafe_convert(Ptr{T}, A) + (I[1].start-1)*sizeof(T)
+    ptr = pointer(A) + (I[1].start-1)*sizeof(T)
     len = I[1].stop - I[1].start + 1
     return CuDeviceArray(len, ptr)
 end
diff --git a/src/device/intrinsics/memory_shared.jl b/src/device/intrinsics/memory_shared.jl
index 3248a34b..2a2703bc 100644
--- a/src/device/intrinsics/memory_shared.jl
+++ b/src/device/intrinsics/memory_shared.jl
@@ -13,13 +13,16 @@ shmem_id = 0
 function emit_shmem(id, llvmtyp, len, align)
     var = Symbol("@shmem", id)
     jltyp = jltypes[llvmtyp]
+
+    @gensym ptr
     quote
-        Base.llvmcall(
+        $ptr = Base.llvmcall(
             ($"""$var = external addrspace(3) global [$len x $llvmtyp], align $align""",
              $"""%1 = getelementptr inbounds [$len x $llvmtyp], [$len x $llvmtyp] addrspace(3)* $var, i64 0, i64 0
                  %2 = addrspacecast $llvmtyp addrspace(3)* %1 to $llvmtyp addrspace(0)*
                  ret $llvmtyp* %2"""),
             Ptr{$jltyp}, Tuple{})
+        DevicePtr{$jltyp,AS.Shared}($ptr)
     end
 end
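+
+# Example (sketch): kernel code does not call emit_shmem directly, but goes through
+# the macros below, e.g.
+#
+#   function kernel()
+#       tmp = @cuStaticSharedMem(Float32, 64)   # CuDeviceArray{Float32,1,AS.Shared}
+#       tmp[threadIdx().x] = 0f0
+#       sync_threads()
+#       return nothing
+#   end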
@@ -28,7 +31,7 @@ end
 # to `@cuda`
 
 """
-    @cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ}
+    @cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ,Shared}
 
 Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a statically-allocated piece of shared memory. The type should be statically
@@ -53,10 +56,11 @@ function emit_static_shmem{N, T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::
     len = prod(shape)
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, llvmtyp, len, align))
-        CuDeviceArray{$jltyp}($shape, ptr)
+        $ptr = $(emit_shmem(id, llvmtyp, len, align))
+        CuDeviceArray($shape, $ptr)
     end
 end
 
@@ -69,10 +73,11 @@ function emit_static_shmem{N}(id::Integer, jltyp::Type, shape::NTuple{N,<:Intege
     len = prod(shape) * sizeof(jltyp)
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, :i8, len, align))
-        CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
+        $ptr = $(emit_shmem(id, :i8, len, align))
+        CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
     end
 end
 
@@ -82,7 +87,7 @@
 
 """
-    @cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ}
+    @cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ,Shared}
 
 Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -114,10 +119,11 @@ function emit_dynamic_shmem{T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::Un
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
-        CuDeviceArray{$jltyp}($shape, ptr)
+        $ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
+        CuDeviceArray($shape, $ptr)
     end
 end
 
@@ -129,10 +135,11 @@ function emit_dynamic_shmem(id::Integer, jltyp::Type, shape::Union{Expr,Symbol},
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
-        CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
+        $ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
+        CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
     end
 end
diff --git a/src/execution.jl b/src/execution.jl
index b97de19f..3eb80673 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -1,6 +1,6 @@
 # Native execution support
 
-export @cuda, nearest_warpsize
+export @cuda, nearest_warpsize, cudaconvert
 
 using Base.Iterators: filter
 
@@ -9,32 +9,8 @@ using Base.Iterators: filter
 # Auxiliary
 #
 
-# Determine which type to pre-convert objects to for use on a CUDA device.
-#
-# The resulting object type will be used as a starting point to determine the final argument
-# types. This is different from `cconvert` in that we don't know which type to convert to.
-function convert_type(t)
-    # NOTE: this conversion was originally intended to be a user-extensible interface,
-    #       a la cconvert (look for cudaconvert in f1e592e61d6898869b918331e3e625292f4c8cab).
-    #
-    #       however, the generated function behind @cuda isn't allowed to call overloaded
-    #       functions (only pure ones), and also won't be able to see functions defined
-    #       after the generated function's body (see JuliaLang/julia#19942).
-
-    # Pointer handling
-    if t <: DevicePtr
-        return Ptr{t.parameters...}
-    elseif t <: Ptr
-        throw(InexactError())
-    end
-
-    # Array types
-    if t <: CuArray
-        return CuDeviceArray{t.parameters...}
-    end
-
-    return t
-end
+# NOTE: this method is used in a generated function, so it effectively cannot be
+#       extended by users: definitions loaded afterwards would not be visible there
+#       (see JuliaLang/julia#19942)
+cudaconvert(::Type{T}) where {T} = T
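+
+# Example: the identity fallback leaves most types alone, while device/array.jl
+# maps CuArray onto its device-side counterpart:
+#
+#   cudaconvert(Tuple{Int,Float32}) == Tuple{Int,Float32}
+#   cudaconvert(CuArray{Float32,2}) == CuDeviceArray{Float32,2,AS.Global}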
 
 # Convert the arguments to a kernel function to their CUDA representation, and figure out
 # what types to specialize the kernel function for.
@@ -45,7 +21,7 @@ function convert_arguments(args, types)
     # convert types to their CUDA representation
     for i in 1:length(argexprs)
         t = argtypes[i]
-        ct = convert_type(t)
+        ct = cudaconvert(t)
         if ct != t
             argtypes[i] = ct
             if ct <: Ptr
@@ -56,8 +32,6 @@ end
         end
     end
 
-    # NOTE: DevicePtr's should have disappeared after this point
-
     for argtype in argtypes
         if argtype.layout == C_NULL || !Base.datatype_pointerfree(argtype)
             error("don't know how to handle argument of type $argtype")
@@ -108,8 +82,8 @@ the launch should be scheduled.
 
 The `func` argument should be a valid Julia function. It will be compiled to a CUDA
 function upon first use, and to a certain extent arguments will be converted and managed
-automatically. Finally, a call to `cudacall` is performed, scheduling the compiled function
-for execution on the GPU.
+automatically (see [`cudaconvert`](@ref)). Finally, a call to `cudacall` is performed,
+scheduling the compiled function for execution on the GPU.
 """
 macro cuda(config::Expr, callexpr::Expr)
     # sanity checks
diff --git a/src/pointer.jl b/src/pointer.jl
new file mode 100644
index 00000000..938a64b7
--- /dev/null
+++ b/src/pointer.jl
@@ -0,0 +1,148 @@
+# Device pointer with address space information
+
+#
+# Address spaces
+#
+
+export
+    AS, addrspace
+
+abstract type AddressSpace end
+
+module AS
+
+using CUDAnative
+import CUDAnative: AddressSpace
+
+struct Generic  <: AddressSpace end
+struct Global   <: AddressSpace end
+struct Shared   <: AddressSpace end
+struct Constant <: AddressSpace end
+struct Local    <: AddressSpace end
+
+end
+
+
+#
+# Device pointer
+#
+
+struct DevicePtr{T,A}
+    ptr::Ptr{T}
+
+    # inner constructors, fully parameterized
+    DevicePtr{T,A}(ptr::Ptr{T}) where {T,A<:AddressSpace} = new(ptr)
+end
+
+# outer constructors, partially parameterized
+(::Type{DevicePtr{T}})(ptr::Ptr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr)
+
+# outer constructors, non-parameterized
+DevicePtr(ptr::Ptr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr)
+
+
+## getters
+
+Base.pointer(p::DevicePtr) = p.ptr
+
+Base.isnull(p::DevicePtr) = (pointer(p) == C_NULL)
+Base.eltype(::Type{<:DevicePtr{T}}) where {T} = T
+
+addrspace(x) = addrspace(typeof(x))
+addrspace(::Type{DevicePtr{T,A}}) where {T,A} = A
+
+
+## conversions
+
+# between regular and device pointers
+## simple conversions disallowed
+Base.convert(::Type{Ptr{T}}, p::DevicePtr{T}) where {T} = throw(InexactError())
+Base.convert(::Type{<:DevicePtr{T}}, p::Ptr{T}) where {T} = throw(InexactError())
+## unsafe ones are allowed
+Base.unsafe_convert(::Type{Ptr{T}}, p::DevicePtr{T}) where {T} = pointer(p)
+
+# defer conversions to DevicePtr to unsafe_convert
+Base.cconvert(::Type{<:DevicePtr}, x) = x
+
+# between device pointers
+Base.convert(::Type{<:DevicePtr}, p::DevicePtr) = throw(InexactError())
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{T,A}) where {T,A} = p
+Base.unsafe_convert(::Type{DevicePtr{T,A}}, p::DevicePtr) where {T,A} = DevicePtr{T,A}(reinterpret(Ptr{T}, pointer(p)))
+## identical addrspaces
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
+## convert to & from generic
+Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr) where {T} = Base.unsafe_convert(DevicePtr{T,AS.Generic}, p)
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,AS.Generic}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
+Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr{T,AS.Generic}) where {T} = p # avoid ambiguities
+## unspecified, preserve source addrspace
+Base.convert(::Type{DevicePtr{T}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
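+
+# Example of the rules above (hypothetical pointer values, for illustration):
+#
+#   p = DevicePtr{Float32,AS.Global}(Ptr{Float32}(1))
+#   convert(DevicePtr{Float32,AS.Generic}, p)   # ok: cast towards generic
+#   convert(DevicePtr{Float32}, p)              # ok: addrspace preserved (Global)
+#   convert(DevicePtr{Float32,AS.Shared}, p)    # throws InexactError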
+
+
+## limited pointer arithmetic & comparison
+
+Base.:(==)(a::DevicePtr, b::DevicePtr) = pointer(a) == pointer(b) && addrspace(a) == addrspace(b)
+
+Base.isless(x::DevicePtr, y::DevicePtr) = Base.isless(pointer(x), pointer(y))
+Base.:(-)(x::DevicePtr, y::DevicePtr) = pointer(x) - pointer(y)
+
+Base.:(+)(x::DevicePtr{T,A}, y::Integer) where {T,A} = DevicePtr{T,A}(pointer(x) + y)
+Base.:(-)(x::DevicePtr{T,A}, y::Integer) where {T,A} = DevicePtr{T,A}(pointer(x) - y)
+Base.:(+)(x::Integer, y::DevicePtr) = y + x
+
+
+## memory operations
+
+Base.convert(::Type{Int}, ::Type{AS.Generic})  = 0
+Base.convert(::Type{Int}, ::Type{AS.Global})   = 1
+Base.convert(::Type{Int}, ::Type{AS.Shared})   = 3
+Base.convert(::Type{Int}, ::Type{AS.Constant}) = 4
+Base.convert(::Type{Int}, ::Type{AS.Local})    = 5
+
+@generated function Base.unsafe_load(p::DevicePtr{T,A}, i::Integer=1,
+                                     ::Type{Val{align}}=Val{1}) where {T,A,align}
+    eltyp = convert(LLVMType, T)
+
+    # create a function
+    param_types = [LLVM.PointerType(eltyp),
+                   LLVM.IntType(sizeof(Int)*8, jlctx[])]
+    llvmf = create_llvmf(eltyp, param_types)
+
+    # generate IR
+    Builder(jlctx[]) do builder
+        entry = BasicBlock(llvmf, "entry", jlctx[])
+        position!(builder, entry)
+
+        ptr = gep!(builder, parameters(llvmf)[1], [parameters(llvmf)[2]])
+        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
+        val = load!(builder, ptr_with_as)
+        alignment!(val, align)
+        ret!(builder, val)
+    end
+
+    call_llvmf(llvmf, T, Tuple{Ptr{T}, Int}, :((pointer(p), Int(i-1))))
+end
+
+@generated function Base.unsafe_store!(p::DevicePtr{T,A}, x, i::Integer=1,
+                                       ::Type{Val{align}}=Val{1}) where {T,A,align}
+    eltyp = convert(LLVMType, T)
+
+    # create a function
+    param_types = [LLVM.PointerType(eltyp), eltyp,
+                   LLVM.IntType(sizeof(Int)*8, jlctx[])]
+    llvmf = create_llvmf(LLVM.VoidType(jlctx[]), param_types)
+
+    # generate IR
+    Builder(jlctx[]) do builder
+        entry = BasicBlock(llvmf, "entry", jlctx[])
+        position!(builder, entry)
+
+        ptr = gep!(builder, parameters(llvmf)[1], [parameters(llvmf)[3]])
+        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
+        val = parameters(llvmf)[2]
+        inst = store!(builder, val, ptr_with_as)
+        alignment!(inst, align)
+        ret!(builder)
+    end
+
+    call_llvmf(llvmf, Void, Tuple{Ptr{T}, T, Int}, :((pointer(p), convert(T,x), Int(i-1))))
+end
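+
+# Example (sketch): loads and stores through a DevicePtr emit address-space-qualified
+# IR; for a global pointer the accesses below go through `addrspace(1)`:
+#
+#   p = DevicePtr{Float32,AS.Global}(some_ptr)   # `some_ptr` obtained elsewhere
+#   x = Base.unsafe_load(p, 1, Val{16})          # 16-byte-aligned load
+#   Base.unsafe_store!(p, x + 1f0, 1, Val{16})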
diff --git a/test/array.jl b/test/array.jl
index 93a33056..a2ed806e 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -6,36 +6,46 @@
 
 # inner constructors
 let p = Ptr{Int}(C_NULL)
-    @on_device CuDeviceArray{Int,1}((1,), $p)
+    dp = CUDAnative.DevicePtr(p)
+    CuDeviceArray{Int,1,AS.Generic}((1,), dp)
 end
 
 # outer constructors
 for I in [Int32,Int64]
     a = I(1)
     b = I(2)
+    p = Ptr{I}(C_NULL)
+    dp = CUDAnative.DevicePtr(p)
 
     # not parameterized
-    @on_device CuDeviceArray($b, $p)
-    @on_device CuDeviceArray(($b,), $p)
-    @on_device CuDeviceArray(($b,$a), $p)
+    CuDeviceArray(b, dp)
+    CuDeviceArray((b,), dp)
+    CuDeviceArray((b,a), dp)
 
     # partially parameterized
-    @on_device CuDeviceArray{$I}($b, $p)
-    @on_device CuDeviceArray{$I}(($b,), $p)
-    @on_device CuDeviceArray{$I}(($a,$b), $p)
+    CuDeviceArray{I}(b, dp)
+    CuDeviceArray{I}((b,), dp)
+    CuDeviceArray{I}((a,b), dp)
+    CuDeviceArray{I,1}(b, dp)
+    CuDeviceArray{I,1}((b,), dp)
+    @test_throws MethodError CuDeviceArray{I,1}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,2}(b, dp)
+    @test_throws MethodError CuDeviceArray{I,2}((b,), dp)
+    CuDeviceArray{I,2}((a,b), dp)
 
     # fully parameterized
-    @on_device CuDeviceArray{$I,1}($b, $p)
-    @on_device CuDeviceArray{$I,1}(($b,), $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,1}(($a,$b), $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,2}($b, $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,2}(($b,), $p)
-    @on_device CuDeviceArray{$I,2}(($a,$b), $p)
+    CuDeviceArray{I,1,AS.Generic}(b, dp)
+    CuDeviceArray{I,1,AS.Generic}((b,), dp)
+    @test_throws MethodError CuDeviceArray{I,1,AS.Generic}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,1,AS.Shared}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,2,AS.Generic}(b, dp)
+    @test_throws MethodError CuDeviceArray{I,2,AS.Generic}((b,), dp)
+    CuDeviceArray{I,2,AS.Generic}((a,b), dp)
 
     # type aliases
-    @on_device CuDeviceVector{$I}($b, $p)
-    @on_device CuDeviceMatrix{$I}(($a,$b), $p)
+    CuDeviceVector{I}(b, dp)
+    CuDeviceMatrix{I}((a,b), dp)
 end
 end
@@ -78,14 +88,14 @@
 
     # NOTE: these tests verify that bounds checking is _disabled_ (see #4)
 
-    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_1d, (CuDeviceArray{Int,1},)))
+    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_1d, (CuDeviceArray{Int,1,AS.Global},)))
     @test !contains(ir, "trap")
 
     @eval function array_oob_2d(array)
         return array[1, 1]
     end
 
-    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_2d, (CuDeviceArray{Int,2},)))
+    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_2d, (CuDeviceArray{Int,2,AS.Global},)))
     @test !contains(ir, "trap")
 end
 
@@ -119,4 +129,17 @@ end
 
 ############################################################################################
 
+
+@testset "bug: non-Int index to unsafe_load" begin
+    @eval function array_load_index(a)
+        return a[UInt64(1)]
+    end
+
+    a = [1]
+    p = pointer(a)
+    dp = CUDAnative.DevicePtr(p)
+    da = CUDAnative.CuDeviceArray(1, dp)
+    array_load_index(da)
+end
+
 end
diff --git a/test/codegen.jl b/test/codegen.jl
index ff274515..0b3cc3a7 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -81,6 +81,25 @@ end
     end
 end
 
+if Base.VERSION >= v"0.6.1-pre.1"
+    # JuliaLang/julia#22022 is required for AS-specific operations to work
+    # on certain structs, which this test verifies.
+    #
+    # The test is therefore disabled on older versions: it requires at least one
+    # commit on the release-0.6 branch (v0.6.1-pre.1), which we assume includes #22022.
+
+    @testset "LLVM D32593" begin
+        @eval struct llvm_D32593_struct
+            foo::Float32
+            bar::Float32
+        end
+
+        @eval llvm_D32593(arr) = arr[1].foo
+
+        CUDAnative.code_llvm(DevNull, llvm_D32593, Tuple{CuDeviceVector{llvm_D32593_struct,AS.Global}})
+    end
+end
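+
+# For reference (hedged expectation, not asserted by the test): the IR for
+# `llvm_D32593` should access the struct field through an addrspace(1) pointer,
+# i.e. something along the lines of:
+#
+#   %... = load float, float addrspace(1)* %..., align 4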
+
 end
diff --git a/test/execution.jl b/test/execution.jl
index 000c7a64..27d33bd6 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -137,7 +137,8 @@ len = prod(dims)
     input_dev = CuArray(input)
     output_dev = similar(input_dev)
 
-    @cuda (1,len) exec_pass_ptr(input_dev.devptr, output_dev.devptr)
+    @cuda (1,len) exec_pass_ptr(Base.unsafe_convert(Ptr{Float32}, input_dev),
+                                Base.unsafe_convert(Ptr{Float32}, output_dev))
     output = Array(output_dev)
     @test input ≈ output
 end
@@ -161,7 +162,8 @@ end
     arr_dev = CuArray(arr)
     val_dev = CuArray(val)
 
-    @cuda (1,len) exec_pass_scalar(arr_dev.devptr, val_dev.devptr)
+    @cuda (1,len) exec_pass_scalar(Base.unsafe_convert(Ptr{Float32}, arr_dev),
+                                   Base.unsafe_convert(Ptr{Float32}, val_dev))
     @test arr[dims...] ≈ Array(val_dev)[1]
 end
 
@@ -187,7 +189,8 @@ end
     arr_dev = CuArray(arr)
     val_dev = CuArray(val)
 
-    @cuda (1,len) exec_pass_scalar_devfun(arr_dev.devptr, val_dev.devptr)
+    @cuda (1,len) exec_pass_scalar_devfun(Base.unsafe_convert(Ptr{Float32}, arr_dev),
+                                          Base.unsafe_convert(Ptr{Float32}, val_dev))
     @test arr[dims...] ≈ Array(val_dev)[1]
 end
 
@@ -207,7 +210,7 @@ end
     keeps = (true,)
     d_out = CuArray{Int}(1)
 
-    @cuda (1,1) exec_pass_tuples(keeps, d_out.devptr)
+    @cuda (1,1) exec_pass_tuples(keeps, Base.unsafe_convert(Ptr{Int}, d_out))
     @test Array(d_out) == [1]
 end
 
@@ -231,7 +234,10 @@ end
         return nothing
     end
 
-    @cuda (1,len) exec_pass_ghost(ExecGhost(), d_a.devptr, d_b.devptr, d_c.devptr)
+    @cuda (1,len) exec_pass_ghost(ExecGhost(),
+                                  Base.unsafe_convert(Ptr{Float32}, d_a),
+                                  Base.unsafe_convert(Ptr{Float32}, d_b),
+                                  Base.unsafe_convert(Ptr{Float32}, d_c))
     c = Array(d_c)
     @test a+b == c
 
@@ -245,7 +251,9 @@ end
         return nothing
     end
 
-    @cuda (1,len) exec_pass_ghost_aggregate(ExecGhost(), d_c.devptr, (42,))
+    @cuda (1,len) exec_pass_ghost_aggregate(ExecGhost(),
+                                            Base.unsafe_convert(Ptr{Float32}, d_c),
+                                            (42,))
     c = Array(d_c)
     @test all(val->val==42, c)
 
@@ -263,7 +271,7 @@ end
     A = CuArray(zeros(Float32, (1,)))
     x = Complex64(2,2)
 
-    @cuda (1, 1) exec_pass_immutables(A.devptr, x)
+    @cuda (1, 1) exec_pass_immutables(Base.unsafe_convert(Ptr{Float32}, A), x)
     @test Array(A) == Float32[imag(x)]
 end
 
diff --git a/test/pointer.jl b/test/pointer.jl
new file mode 100644
index 00000000..515d3c67
--- /dev/null
+++ b/test/pointer.jl
@@ -0,0 +1,59 @@
+@testset "pointer" begin
+
+# inner constructors
+
+const generic_null = CUDAnative.DevicePtr{Void,AS.Generic}(C_NULL)
+const global_null = CUDAnative.DevicePtr{Void,AS.Global}(C_NULL)
+const local_null = CUDAnative.DevicePtr{Void,AS.Local}(C_NULL)
+
+const C_NONNULL = Ptr{Void}(1)
+const generic_nonnull = CUDAnative.DevicePtr{Void,AS.Generic}(C_NONNULL)
+const global_nonnull = CUDAnative.DevicePtr{Void,AS.Global}(C_NONNULL)
+const local_nonnull = CUDAnative.DevicePtr{Void,AS.Local}(C_NONNULL)
+
+const C_ONE = Ptr{Int}(1)
+const generic_one = CUDAnative.DevicePtr{Int,AS.Generic}(C_ONE)
+const global_one = CUDAnative.DevicePtr{Int,AS.Global}(C_ONE)
+const local_one = CUDAnative.DevicePtr{Int,AS.Local}(C_ONE)
+
+# outer constructors
+@test CUDAnative.DevicePtr{Void}(C_NULL) == generic_null
+@test CUDAnative.DevicePtr(C_NULL) == generic_null
+
+# getters
+@test eltype(generic_null) == Void
+@test addrspace(generic_null) == AS.Generic
+@test isnull(generic_null)
+@test !isnull(generic_nonnull)
+
+# comparisons
+@test generic_null != generic_one
+@test generic_null != global_null
+@test local_null != global_null
+
+
+@testset "conversions" begin
+
+# between regular and device pointers
+
+@test_throws InexactError convert(Ptr{Void}, generic_null)
+@test_throws InexactError convert(CUDAnative.DevicePtr{Void}, C_NULL)
+
+@test Base.unsafe_convert(Ptr{Void}, generic_null) == C_NULL
+
+
+# between device pointers
+
+@test_throws InexactError convert(typeof(local_null), global_null)
+@test convert(typeof(generic_null), generic_null) == generic_null
+@test convert(typeof(global_null), global_null) == global_null
+@test Base.unsafe_convert(typeof(local_null), global_null) == local_null
+
+@test convert(typeof(global_null), global_one) == global_nonnull
+@test convert(typeof(generic_null), global_one) == generic_nonnull
+@test convert(typeof(global_null), generic_one) == global_nonnull
+@test convert(CUDAnative.DevicePtr{Void}, global_one) == global_nonnull
+
+end
+
+end
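+
+# NOTE (illustrative): arithmetic preserves the address space, e.g.
+#   global_one + sizeof(Int)  isa  CUDAnative.DevicePtr{Int,AS.Global}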
diff --git a/test/runtests.jl b/test/runtests.jl
index 0487897d..c0556c2a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using Base.Test
 
 include("util.jl")
 include("base.jl")
+include("pointer.jl")
 
 if CUDAnative.configured    # requiring a configured LLVM.jl