Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit

Permalink
Introduce DevicePtr to track address space of pointers.
Browse files Browse the repository at this point in the history
Optimize loads and stores using LLVM.jl.
  • Loading branch information
maleadt committed Aug 1, 2017
1 parent 7120e90 commit 654fdd8
Show file tree
Hide file tree
Showing 12 changed files with 398 additions and 106 deletions.
4 changes: 2 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
julia 0.6
CUDAdrv 0.4.2
LLVM 0.3.6
CUDAdrv 0.5.0
LLVM 0.3.8
15 changes: 11 additions & 4 deletions src/CUDAnative.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,22 @@ else
false
end

include("cgutils.jl")
include("pointer.jl")

# needs to be loaded _before_ the compiler infrastructure, because of generated functions
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl"))
include(joinpath("device", "libdevice.jl"))

include("jit.jl")
include("profile.jl")
include(joinpath("device", "util.jl"))
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl")) # some of these files contain generated functions,
include(joinpath("device", "libdevice.jl")) # so should get loaded late (JuliaLang/julia#19942)
include("execution.jl")
include("reflection.jl")

const default_device = Ref{CuDevice}()
const default_context = Ref{CuContext}()
const jlctx = Ref{LLVM.Context}()
function __init__()
if !configured
warn("CUDAnative.jl has not been configured, and will not work properly.")
Expand All @@ -49,6 +54,8 @@ function __init__()
default_device[] = CuDevice(0)
default_context[] = CuContext(default_device[])

jlctx[] = LLVM.Context(cglobal(:jl_LLVMContext, Void))

init_jit()
end

Expand Down
31 changes: 30 additions & 1 deletion src/device/util.jl → src/cgutils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Utility functions for implementing intrinsics and other device code
# Code generation utility functions

# how to map primitive Julia types to LLVM data types
const llvmtypes = Dict{Type,Symbol}(
Expand Down Expand Up @@ -146,3 +146,32 @@ Base.@pure function datatype_align(::Type{T}) where {T}
field = T.layout + sizeof(UInt32)
unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1)
end


# create an LLVM function, given its return (LLVM) type and a vector of argument types
# Build a fresh LLVM function (in a throwaway "llvmcall" module within the Julia
# LLVM context) with the given return type and parameter types. The function is
# marked `alwaysinline` so it disappears into its caller during optimization.
function create_llvmf(ret::LLVMType, params::Vector{LLVMType}, name::String="")::LLVM.Function
    holder = LLVM.Module("llvmcall", jlctx[])

    fun = LLVM.Function(holder, name, LLVM.FunctionType(ret, params))
    push!(function_attributes(fun), EnumAttribute("alwaysinline"))

    return fun
end

# call an LLVM function, given its return (Julia) type, a tuple-type for the arguments,
# and an expression yielding a tuple of the actual argument values.
# Produce an expression that performs an inlined `llvmcall` of `llvmf`, where
# `ret` is the Julia return type, `params` a tuple-type describing the argument
# types, and `args` an expression yielding a tuple of the actual argument values.
function call_llvmf(llvmf::LLVM.Function, ret::Type, params::Type, args::Expr)
    return quote
        Base.@_inline_meta
        Base.llvmcall(LLVM.ref($llvmf), $ret, $params, $args...)
    end
end

# Map a Julia type to its LLVM representation via Julia's own type-translation
# machinery (`julia_type_to_llvm`). Only unboxed representations are supported;
# a boxed (heap-pointer) result trips the assertion below.
function Base.convert(::Type{LLVMType}, typ::Type)
    isboxed_ref = Ref{Bool}()
    llvmtyp = LLVMType(ccall(:julia_type_to_llvm, LLVM.API.LLVMTypeRef,
                             (Any, Ptr{Bool}), typ, isboxed_ref))
    # boxed types have no usable device-side representation here
    @assert !isboxed_ref[]
    return llvmtyp
end
81 changes: 49 additions & 32 deletions src/device/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ export
"""
CuDeviceArray(dims, ptr)
CuDeviceArray{T}(dims, ptr)
CuDeviceArray{T,N}(dims, ptr)
CuDeviceArray{T,A}(dims, ptr)
CuDeviceArray{T,A,N}(dims, ptr)
Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
Expand All @@ -23,62 +24,69 @@ CuDeviceArray
# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N} <: AbstractArray{T,N}
struct CuDeviceArray{T,N,A} <: AbstractArray{T,N}
shape::NTuple{N,Int}
ptr::Ptr{T}
ptr::DevicePtr{T,A}

# inner constructors (exact types, ie. Int not <:Integer)
CuDeviceArray{T,N}(shape::NTuple{N,Int}, ptr::Ptr{T}) where {T,N} = new(shape, ptr)
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(shape::NTuple{N,Int}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr)
end

const CuDeviceVector = CuDeviceArray{T,1} where {T}
const CuDeviceMatrix = CuDeviceArray{T,2} where {T}
const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}

# outer constructors, non-parameterized
CuDeviceArray(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
CuDeviceArray(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, partially parameterized
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceVector{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, fully parameterized
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(Int.(dims), p)
(::Type{CuDeviceVector{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((Int(len),), p)
(::Type{CuDeviceArray{T,N,A}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p)
(::Type{CuDeviceVector{T,A}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p)

Base.convert(::Type{CuDeviceArray{T,N}}, a::CuArray{T,N}) where {T,N} =
CuDeviceArray{T,N}(a.shape, Base.unsafe_convert(Ptr{T}, a.devptr))

Base.unsafe_convert(::Type{Ptr{T}}, a::CuDeviceArray{T}) where {T} = a.ptr::Ptr{T}
## getters


## array interface
Base.pointer(a::CuDeviceArray) = a.ptr

Base.size(g::CuDeviceArray) = g.shape
Base.length(g::CuDeviceArray) = prod(g.shape)

@inline function Base.getindex(A::CuDeviceArray{T}, index::Int) where {T}

## conversions

Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)

# from CuArray
function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N}
owned_ptr = pointer(a)
ptr = Base.unsafe_convert(Ptr{T}, owned_ptr)
CuDeviceArray{T,N,AS.Global}(a.shape, DevicePtr{T,AS.Global}(ptr))
end
cudaconvert(::Type{CuArray{T,N}}) where {T,N} = CuDeviceArray{T,N,AS.Global}


## indexing

@inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerref(Base.unsafe_convert(Ptr{T}, A), index, align)::T
Base.unsafe_load(pointer(A), index, Val{align})::T
end

@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Int) where {T}
@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerset(Base.unsafe_convert(Ptr{T}, A), convert(T, x)::T, index, align)
Base.unsafe_store!(pointer(A), x, index, Val{align})
end

Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear()

Base.show(io::IO, a::CuDeviceVector{T}) where {T} =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray{T,N}) where {T,N} =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")


## quirks

# bounds checking is currently broken due to a PTX assembler issue (see #4)
Base.checkbounds(::CuDeviceArray, I...) = nothing

Expand All @@ -88,10 +96,19 @@ struct CuBoundsError <: Exception end
@inline Base.throw_boundserror(A::CuDeviceArray, I) =
(Base.@_noinline_meta; throw(CuBoundsError()))

# idem

## other

Base.show(io::IO, a::CuDeviceVector) =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray) =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")

Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)

function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T}
Base.@_inline_meta
ptr = Base.unsafe_convert(Ptr{T}, A) + (I[1].start-1)*sizeof(T)
ptr = pointer(A) + (I[1].start-1)*sizeof(T)
len = I[1].stop - I[1].start + 1
return CuDeviceArray(len, ptr)
end
29 changes: 18 additions & 11 deletions src/device/intrinsics/memory_shared.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ shmem_id = 0
# Generate an expression that declares an external shared-memory global
# (`addrspace(3)`) of `len` elements of LLVM type `llvmtyp` with alignment
# `align`, and evaluates to a `DevicePtr{jltyp,AS.Shared}` to its first element.
# A gensym'd variable holds the raw pointer so the wrapping stays hygienic.
function emit_shmem(id, llvmtyp, len, align)
    var = Symbol("@shmem", id)
    jltyp = jltypes[llvmtyp]

    @gensym ptr
    quote
        $ptr = Base.llvmcall(
            ($"""$var = external addrspace(3) global [$len x $llvmtyp], align $align""",
             $"""%1 = getelementptr inbounds [$len x $llvmtyp], [$len x $llvmtyp] addrspace(3)* $var, i64 0, i64 0
                 %2 = addrspacecast $llvmtyp addrspace(3)* %1 to $llvmtyp addrspace(0)*
                 ret $llvmtyp* %2"""),
            Ptr{$jltyp}, Tuple{})
        DevicePtr{$jltyp,AS.Shared}($ptr)
    end
end

Expand All @@ -28,7 +31,7 @@ end
# to `@cuda`

"""
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ}
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ,Shared}
Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a statically-allocated piece of shared memory. The type should be statically
Expand All @@ -53,10 +56,11 @@ function emit_static_shmem{N, T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::
len = prod(shape)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray($shape, $ptr)
end
end

Expand All @@ -69,10 +73,11 @@ function emit_static_shmem{N}(id::Integer, jltyp::Type, shape::NTuple{N,<:Intege
len = prod(shape) * sizeof(jltyp)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end

Expand All @@ -82,7 +87,7 @@ end


"""
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ}
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ,Shared}
Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a dynamically-allocated piece of shared memory. The type should be statically
Expand Down Expand Up @@ -114,10 +119,11 @@ function emit_dynamic_shmem{T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::Un

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray($shape, $ptr)
end
end

Expand All @@ -129,10 +135,11 @@ function emit_dynamic_shmem(id::Integer, jltyp::Type, shape::Union{Expr,Symbol},

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end

Expand Down
38 changes: 6 additions & 32 deletions src/execution.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Native execution support

export @cuda, nearest_warpsize
export @cuda, nearest_warpsize, cudaconvert

using Base.Iterators: filter

Expand All @@ -9,32 +9,8 @@ using Base.Iterators: filter
# Auxiliary
#

# Determine which type to pre-convert objects to for use on a CUDA device.
#
# The resulting object type will be used as a starting point to determine the final argument
# types. This is different from `cconvert` in that we don't know which type to convert to.
function convert_type(t)
# NOTE: this conversion was originally intended to be a user-extensible interface,
# a la cconvert (look for cudaconvert in f1e592e61d6898869b918331e3e625292f4c8cab).
#
# however, the generated function behind @cuda isn't allowed to call overloaded
# functions (only pure ones), and also won't be able to see functions defined
# after the generated function's body (see JuliaLang/julia#19942).

# Pointer handling
if t <: DevicePtr
return Ptr{t.parameters...}
elseif t <: Ptr
throw(InexactError())
end

# Array types
if t <: CuArray
return CuDeviceArray{t.parameters...}
end

return t
end
# Fallback: by default a kernel argument type is passed through unchanged;
# other methods (e.g. for CuArray) map host types to their device counterparts.
# NOTE: this method cannot be extended, because it is used in a generated function
cudaconvert(::Type{T}) where {T} = T

# Convert the arguments to a kernel function to their CUDA representation, and figure out
# what types to specialize the kernel function for.
Expand All @@ -45,7 +21,7 @@ function convert_arguments(args, types)
# convert types to their CUDA representation
for i in 1:length(argexprs)
t = argtypes[i]
ct = convert_type(t)
ct = cudaconvert(t)
if ct != t
argtypes[i] = ct
if ct <: Ptr
Expand All @@ -56,8 +32,6 @@ function convert_arguments(args, types)
end
end

# NOTE: DevicePtr's should have disappeared after this point

for argtype in argtypes
if argtype.layout == C_NULL || !Base.datatype_pointerfree(argtype)
error("don't know how to handle argument of type $argtype")
Expand Down Expand Up @@ -108,8 +82,8 @@ the launch should be scheduled.
The `func` argument should be a valid Julia function. It will be compiled to a CUDA function
upon first use, and to a certain extent arguments will be converted and managed
automatically. Finally, a call to `cudacall` is performed, scheduling the compiled function
for execution on the GPU.
automatically (see [`cudaconvert`](@ref)). Finally, a call to `cudacall` is performed,
scheduling the compiled function for execution on the GPU.
"""
macro cuda(config::Expr, callexpr::Expr)
# sanity checks
Expand Down
Loading

0 comments on commit 654fdd8

Please sign in to comment.