Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit

Permalink
Introduce DevicePtr to track address space of pointers.
Browse files Browse the repository at this point in the history
Optimize loads and stores using LLVM.jl.
  • Loading branch information
maleadt committed Aug 1, 2017
1 parent 7120e90 commit 654fdd8
Show file tree
Hide file tree
Showing 12 changed files with 398 additions and 106 deletions.
4 changes: 2 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
julia 0.6
CUDAdrv 0.4.2
LLVM 0.3.6
CUDAdrv 0.5.0
LLVM 0.3.8
15 changes: 11 additions & 4 deletions src/CUDAnative.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,22 @@ else
false
end

include("cgutils.jl")
include("pointer.jl")

# needs to be loaded _before_ the compiler infrastructure, because of generated functions
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl"))
include(joinpath("device", "libdevice.jl"))

include("jit.jl")
include("profile.jl")
include(joinpath("device", "util.jl"))
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl")) # some of these files contain generated functions,
include(joinpath("device", "libdevice.jl")) # so should get loaded late (JuliaLang/julia#19942)
include("execution.jl")
include("reflection.jl")

const default_device = Ref{CuDevice}()
const default_context = Ref{CuContext}()
const jlctx = Ref{LLVM.Context}()
function __init__()
if !configured
warn("CUDAnative.jl has not been configured, and will not work properly.")
Expand All @@ -49,6 +54,8 @@ function __init__()
default_device[] = CuDevice(0)
default_context[] = CuContext(default_device[])

jlctx[] = LLVM.Context(cglobal(:jl_LLVMContext, Void))

init_jit()
end

Expand Down
31 changes: 30 additions & 1 deletion src/device/util.jl → src/cgutils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Utility functions for implementing intrinsics and other device code
# Code generation utility functions

# how to map primitive Julia types to LLVM data types
const llvmtypes = Dict{Type,Symbol}(
Expand Down Expand Up @@ -146,3 +146,32 @@ Base.@pure function datatype_align(::Type{T}) where {T}
field = T.layout + sizeof(UInt32)
unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1)
end


# create an LLVM function, given its return (LLVM) type and a vector of argument types
# Build a fresh LLVM function (in a throwaway "llvmcall" module within the Julia
# LLVM context) with the given return type and parameter types. The function is
# marked `alwaysinline` so it disappears into its caller during optimization.
function create_llvmf(ret::LLVMType, params::Vector{LLVMType}, name::String="")::LLVM.Function
    holder = LLVM.Module("llvmcall", jlctx[])

    fun = LLVM.Function(holder, name, LLVM.FunctionType(ret, params))
    push!(function_attributes(fun), EnumAttribute("alwaysinline"))

    return fun
end

# call an LLVM function, given its return (Julia) type, a tuple-type for the arguments,
# and an expression yielding a tuple of the actual argument values.
# Produce an expression that performs an inlined `llvmcall` of `llvmf`, where
# `ret` is the Julia return type, `params` a tuple-type describing the argument
# types, and `args` an expression yielding a tuple of the actual argument values.
function call_llvmf(llvmf::LLVM.Function, ret::Type, params::Type, args::Expr)
    return quote
        Base.@_inline_meta
        Base.llvmcall(LLVM.ref($llvmf), $ret, $params, $args...)
    end
end

# Map a Julia type to its LLVM representation via Julia's own type-translation
# machinery (`julia_type_to_llvm`). Only unboxed representations are supported;
# a boxed (heap-pointer) result trips the assertion below.
function Base.convert(::Type{LLVMType}, typ::Type)
    isboxed_ref = Ref{Bool}()
    llvmtyp = LLVMType(ccall(:julia_type_to_llvm, LLVM.API.LLVMTypeRef,
                             (Any, Ptr{Bool}), typ, isboxed_ref))
    # boxed types have no usable device-side representation here
    @assert !isboxed_ref[]
    return llvmtyp
end
81 changes: 49 additions & 32 deletions src/device/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ export
"""
CuDeviceArray(dims, ptr)
CuDeviceArray{T}(dims, ptr)
CuDeviceArray{T,N}(dims, ptr)
CuDeviceArray{T,A}(dims, ptr)
CuDeviceArray{T,A,N}(dims, ptr)
Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
Expand All @@ -23,62 +24,69 @@ CuDeviceArray
# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N} <: AbstractArray{T,N}
struct CuDeviceArray{T,N,A} <: AbstractArray{T,N}
shape::NTuple{N,Int}
ptr::Ptr{T}
ptr::DevicePtr{T,A}

# inner constructors (exact types, ie. Int not <:Integer)
CuDeviceArray{T,N}(shape::NTuple{N,Int}, ptr::Ptr{T}) where {T,N} = new(shape, ptr)
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(shape::NTuple{N,Int}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr)
end

const CuDeviceVector = CuDeviceArray{T,1} where {T}
const CuDeviceMatrix = CuDeviceArray{T,2} where {T}
const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}

# outer constructors, non-parameterized
CuDeviceArray(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
CuDeviceArray(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, partially parameterized
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceVector{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, fully parameterized
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(Int.(dims), p)
(::Type{CuDeviceVector{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((Int(len),), p)
(::Type{CuDeviceArray{T,N,A}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p)
(::Type{CuDeviceVector{T,A}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p)

Base.convert(::Type{CuDeviceArray{T,N}}, a::CuArray{T,N}) where {T,N} =
CuDeviceArray{T,N}(a.shape, Base.unsafe_convert(Ptr{T}, a.devptr))

Base.unsafe_convert(::Type{Ptr{T}}, a::CuDeviceArray{T}) where {T} = a.ptr::Ptr{T}
## getters


## array interface
Base.pointer(a::CuDeviceArray) = a.ptr

Base.size(g::CuDeviceArray) = g.shape
Base.length(g::CuDeviceArray) = prod(g.shape)

@inline function Base.getindex(A::CuDeviceArray{T}, index::Int) where {T}

## conversions

Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)

# from CuArray
function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N}
owned_ptr = pointer(a)
ptr = Base.unsafe_convert(Ptr{T}, owned_ptr)
CuDeviceArray{T,N,AS.Global}(a.shape, DevicePtr{T,AS.Global}(ptr))
end
cudaconvert(::Type{CuArray{T,N}}) where {T,N} = CuDeviceArray{T,N,AS.Global}


## indexing

@inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerref(Base.unsafe_convert(Ptr{T}, A), index, align)::T
Base.unsafe_load(pointer(A), index, Val{align})::T
end

@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Int) where {T}
@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerset(Base.unsafe_convert(Ptr{T}, A), convert(T, x)::T, index, align)
Base.unsafe_store!(pointer(A), x, index, Val{align})
end

Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear()

Base.show(io::IO, a::CuDeviceVector{T}) where {T} =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray{T,N}) where {T,N} =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")


## quirks

# bounds checking is currently broken due to a PTX assembler issue (see #4)
Base.checkbounds(::CuDeviceArray, I...) = nothing

Expand All @@ -88,10 +96,19 @@ struct CuBoundsError <: Exception end
@inline Base.throw_boundserror(A::CuDeviceArray, I) =
(Base.@_noinline_meta; throw(CuBoundsError()))

# idem

## other

Base.show(io::IO, a::CuDeviceVector) =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray) =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")

Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)

function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T}
Base.@_inline_meta
ptr = Base.unsafe_convert(Ptr{T}, A) + (I[1].start-1)*sizeof(T)
ptr = pointer(A) + (I[1].start-1)*sizeof(T)
len = I[1].stop - I[1].start + 1
return CuDeviceArray(len, ptr)
end
29 changes: 18 additions & 11 deletions src/device/intrinsics/memory_shared.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ shmem_id = 0
# Generate an expression that declares an external shared-memory global
# (`addrspace(3)`) of `len` elements of LLVM type `llvmtyp` with alignment
# `align`, and evaluates to a `DevicePtr{jltyp,AS.Shared}` to its first element.
# A gensym'd variable holds the raw pointer so the wrapping stays hygienic.
function emit_shmem(id, llvmtyp, len, align)
    var = Symbol("@shmem", id)
    jltyp = jltypes[llvmtyp]

    @gensym ptr
    quote
        $ptr = Base.llvmcall(
            ($"""$var = external addrspace(3) global [$len x $llvmtyp], align $align""",
             $"""%1 = getelementptr inbounds [$len x $llvmtyp], [$len x $llvmtyp] addrspace(3)* $var, i64 0, i64 0
                 %2 = addrspacecast $llvmtyp addrspace(3)* %1 to $llvmtyp addrspace(0)*
                 ret $llvmtyp* %2"""),
            Ptr{$jltyp}, Tuple{})
        DevicePtr{$jltyp,AS.Shared}($ptr)
    end
end

Expand All @@ -28,7 +31,7 @@ end
# to `@cuda`

"""
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ}
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ,Shared}
Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a statically-allocated piece of shared memory. The type should be statically
Expand All @@ -53,10 +56,11 @@ function emit_static_shmem{N, T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::
len = prod(shape)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray($shape, $ptr)
end
end

Expand All @@ -69,10 +73,11 @@ function emit_static_shmem{N}(id::Integer, jltyp::Type, shape::NTuple{N,<:Intege
len = prod(shape) * sizeof(jltyp)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end

Expand All @@ -82,7 +87,7 @@ end


"""
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ}
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ,Shared}
Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a dynamically-allocated piece of shared memory. The type should be statically
Expand Down Expand Up @@ -114,10 +119,11 @@ function emit_dynamic_shmem{T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::Un

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray($shape, $ptr)
end
end

Expand All @@ -129,10 +135,11 @@ function emit_dynamic_shmem(id::Integer, jltyp::Type, shape::Union{Expr,Symbol},

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end

Expand Down
38 changes: 6 additions & 32 deletions src/execution.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Native execution support

export @cuda, nearest_warpsize
export @cuda, nearest_warpsize, cudaconvert

using Base.Iterators: filter

Expand All @@ -9,32 +9,8 @@ using Base.Iterators: filter
# Auxiliary
#

# Determine which type to pre-convert objects to for use on a CUDA device.
#
# The resulting object type will be used as a starting point to determine the final argument
# types. This is different from `cconvert` in that we don't know which type to convert to.
function convert_type(t)
# NOTE: this conversion was originally intended to be a user-extensible interface,
# a la cconvert (look for cudaconvert in f1e592e61d6898869b918331e3e625292f4c8cab).
#
# however, the generated function behind @cuda isn't allowed to call overloaded
# functions (only pure ones), and also won't be able to see functions defined
# after the generated function's body (see JuliaLang/julia#19942).

# Pointer handling
if t <: DevicePtr
return Ptr{t.parameters...}
elseif t <: Ptr
throw(InexactError())
end

# Array types
if t <: CuArray
return CuDeviceArray{t.parameters...}
end

return t
end
# Fallback: by default a kernel argument type is passed through unchanged;
# other methods (e.g. for CuArray) map host types to their device counterparts.
# NOTE: this method cannot be extended, because it is used in a generated function
cudaconvert(::Type{T}) where {T} = T

# Convert the arguments to a kernel function to their CUDA representation, and figure out
# what types to specialize the kernel function for.
Expand All @@ -45,7 +21,7 @@ function convert_arguments(args, types)
# convert types to their CUDA representation
for i in 1:length(argexprs)
t = argtypes[i]
ct = convert_type(t)
ct = cudaconvert(t)
if ct != t
argtypes[i] = ct
if ct <: Ptr
Expand All @@ -56,8 +32,6 @@ function convert_arguments(args, types)
end
end

# NOTE: DevicePtr's should have disappeared after this point

for argtype in argtypes
if argtype.layout == C_NULL || !Base.datatype_pointerfree(argtype)
error("don't know how to handle argument of type $argtype")
Expand Down Expand Up @@ -108,8 +82,8 @@ the launch should be scheduled.
The `func` argument should be a valid Julia function. It will be compiled to a CUDA function
upon first use, and to a certain extent arguments will be converted and managed
automatically. Finally, a call to `cudacall` is performed, scheduling the compiled function
for execution on the GPU.
automatically (see [`cudaconvert`](@ref)). Finally, a call to `cudacall` is performed,
scheduling the compiled function for execution on the GPU.
"""
macro cuda(config::Expr, callexpr::Expr)
# sanity checks
Expand Down
Loading

0 comments on commit 654fdd8

Please sign in to comment.