This repository was archived by the owner on May 27, 2021. It is now read-only.

Add DevicePtr parameterized on address space #84

Merged
merged 3 commits on Aug 1, 2017
4 changes: 2 additions & 2 deletions REQUIRE
@@ -1,3 +1,3 @@
julia 0.6
CUDAdrv 0.4.2
LLVM 0.3.6
CUDAdrv 0.5.0
LLVM 0.3.8
1 change: 1 addition & 0 deletions docs/src/lib/compilation.md
@@ -2,5 +2,6 @@

```@docs
CUDAnative.@cuda
CUDAnative.cudaconvert
CUDAnative.nearest_warpsize
```
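`cudaconvert` is the host-side entry point for turning arguments into their device counterparts before a launch; `@cuda` applies it to every argument automatically. A minimal sketch, assuming CUDAdrv's `CuArray(::Array)` upload constructor and a configured setup (the array size is illustrative):

```julia
using CUDAdrv, CUDAnative

dev = CuDevice(0)
ctx = CuContext(dev)

a   = rand(Float32, 1024)
d_a = CuArray(a)                        # device allocation, uploaded from `a`

# wrap the allocation in a CuDeviceArray tagged with the Global address
# space; `@cuda` performs this conversion for each kernel argument
dev_a = CUDAnative.cudaconvert(d_a)     # CuDeviceArray{Float32,1,AS.Global}
```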
15 changes: 11 additions & 4 deletions src/CUDAnative.jl
@@ -16,17 +16,22 @@ else
false
end

include("cgutils.jl")
include("pointer.jl")

# needs to be loaded _before_ the compiler infrastructure, because of generated functions
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl"))
include(joinpath("device", "libdevice.jl"))

include("jit.jl")
include("profile.jl")
include(joinpath("device", "util.jl"))
include(joinpath("device", "array.jl"))
include(joinpath("device", "intrinsics.jl")) # some of these files contain generated functions,
include(joinpath("device", "libdevice.jl")) # so should get loaded late (JuliaLang/julia#19942)
include("execution.jl")
include("reflection.jl")

const default_device = Ref{CuDevice}()
const default_context = Ref{CuContext}()
const jlctx = Ref{LLVM.Context}()
function __init__()
if !configured
warn("CUDAnative.jl has not been configured, and will not work properly.")
@@ -49,6 +54,8 @@ function __init__()
default_device[] = CuDevice(0)
default_context[] = CuContext(default_device[])

jlctx[] = LLVM.Context(cglobal(:jl_LLVMContext, Void))

init_jit()
end

31 changes: 30 additions & 1 deletion src/device/util.jl → src/cgutils.jl
@@ -1,4 +1,4 @@
# Utility functions for implementing intrinsics and other device code
# Code generation utility functions

# how to map primitive Julia types to LLVM data types
const llvmtypes = Dict{Type,Symbol}(
@@ -146,3 +146,32 @@ Base.@pure function datatype_align(::Type{T}) where {T}
field = T.layout + sizeof(UInt32)
unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1)
end


# create an LLVM function, given its return (LLVM) type and a vector of argument types
function create_llvmf(ret::LLVMType, params::Vector{LLVMType}, name::String="")::LLVM.Function
mod = LLVM.Module("llvmcall", jlctx[])

llvmf_typ = LLVM.FunctionType(ret, params)
llvmf = LLVM.Function(mod, name, llvmf_typ)
push!(function_attributes(llvmf), EnumAttribute("alwaysinline"))

return llvmf
end

# call an LLVM function, given its return (Julia) type, a tuple-type for the arguments,
# and an expression yielding a tuple of the actual argument values.
function call_llvmf(llvmf::LLVM.Function, ret::Type, params::Type, args::Expr)
quote
Base.@_inline_meta
Base.llvmcall(LLVM.ref($llvmf), $ret, $params, $args...)
end
end

function Base.convert(::Type{LLVMType}, typ::Type)
isboxed_ref = Ref{Bool}()
llvmtyp = LLVMType(ccall(:julia_type_to_llvm, LLVM.API.LLVMTypeRef,
(Any, Ptr{Bool}), typ, isboxed_ref))
@assert !isboxed_ref[]
return llvmtyp
end
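These helpers are intended to be paired inside generated functions: build a small always-inline LLVM function against `jlctx[]`, then splice an `llvmcall` to it into the caller. A rough sketch of the pattern with a hypothetical `add_one` (the PR's real uses live in `pointer.jl`):

```julia
using LLVM

@generated function add_one(x::Int32)
    T_int32 = convert(LLVMType, Int32)          # via julia_type_to_llvm above

    # emit `i32 @add_one(i32)` into a throwaway module
    llvmf = create_llvmf(T_int32, LLVMType[T_int32], "add_one")
    Builder(jlctx[]) do builder
        entry = BasicBlock(llvmf, "entry", jlctx[])
        position!(builder, entry)
        val = add!(builder, parameters(llvmf)[1], ConstantInt(T_int32, 1))
        ret!(builder, val)
    end

    # return an expression that calls the function with `x`
    call_llvmf(llvmf, Int32, Tuple{Int32}, :((x,)))
end
```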
81 changes: 49 additions & 32 deletions src/device/array.jl
@@ -9,7 +9,8 @@ export
"""
CuDeviceArray(dims, ptr)
CuDeviceArray{T}(dims, ptr)
CuDeviceArray{T,N}(dims, ptr)
CuDeviceArray{T,A}(dims, ptr)
CuDeviceArray{T,A,N}(dims, ptr)

Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
@@ -23,62 +24,69 @@ CuDeviceArray
# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N} <: AbstractArray{T,N}
struct CuDeviceArray{T,N,A} <: AbstractArray{T,N}
shape::NTuple{N,Int}
ptr::Ptr{T}
ptr::DevicePtr{T,A}

# inner constructors (exact types, ie. Int not <:Integer)
CuDeviceArray{T,N}(shape::NTuple{N,Int}, ptr::Ptr{T}) where {T,N} = new(shape, ptr)
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(shape::NTuple{N,Int}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr)
end

const CuDeviceVector = CuDeviceArray{T,1} where {T}
const CuDeviceMatrix = CuDeviceArray{T,2} where {T}
const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}

# outer constructors, non-parameterized
CuDeviceArray(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
CuDeviceArray(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, partially parameterized
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceArray{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
(::Type{CuDeviceVector{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)

# outer constructors, fully parameterized
(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(Int.(dims), p)
(::Type{CuDeviceVector{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((Int(len),), p)
(::Type{CuDeviceArray{T,N,A}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p)
(::Type{CuDeviceVector{T,A}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p)
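A short sketch of how the address-space parameter now flows from the pointer into the array type (the raw address is an illustrative dummy, not a real allocation):

```julia
# tag a raw pointer with an element type and address space
p = DevicePtr{Float32,AS.Global}(Ptr{Float32}(0x1234))

# `A` is inferred from the DevicePtr in every constructor variant,
# so each of these yields a CuDeviceArray{Float32,2,AS.Global}
CuDeviceArray((2, 3), p)
CuDeviceArray{Float32}((2, 3), p)
CuDeviceArray{Float32,2}((2, 3), p)
```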

Base.convert(::Type{CuDeviceArray{T,N}}, a::CuArray{T,N}) where {T,N} =
CuDeviceArray{T,N}(a.shape, Base.unsafe_convert(Ptr{T}, a.devptr))

Base.unsafe_convert(::Type{Ptr{T}}, a::CuDeviceArray{T}) where {T} = a.ptr::Ptr{T}
## getters


## array interface
Base.pointer(a::CuDeviceArray) = a.ptr

Base.size(g::CuDeviceArray) = g.shape
Base.length(g::CuDeviceArray) = prod(g.shape)

@inline function Base.getindex(A::CuDeviceArray{T}, index::Int) where {T}

## conversions

Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)

# from CuArray
function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N}
owned_ptr = pointer(a)
ptr = Base.unsafe_convert(Ptr{T}, owned_ptr)
CuDeviceArray{T,N,AS.Global}(a.shape, DevicePtr{T,AS.Global}(ptr))
end
cudaconvert(a::CuArray{T,N}) where {T,N} = convert(CuDeviceArray{T,N,AS.Global}, a)


## indexing

@inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerref(Base.unsafe_convert(Ptr{T}, A), index, align)::T
Base.unsafe_load(pointer(A), index, Val{align})::T
end

@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Int) where {T}
@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T}
@boundscheck checkbounds(A, index)
align = datatype_align(T)
Base.pointerset(Base.unsafe_convert(Ptr{T}, A), convert(T, x)::T, index, align)
Base.unsafe_store!(pointer(A), x, index, Val{align})
end

Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear()
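Kernel code is unchanged at the source level: indexing still goes through `getindex`/`setindex!`, but loads and stores now dispatch on the typed `DevicePtr`, letting the compiler emit address-space-specific instructions (e.g. `ld.global.f32` rather than generic loads). A minimal kernel sketch, using the 2017-era `@cuda (blocks, threads)` launch syntax:

```julia
function vadd(a, b, c)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    c[i] = a[i] + b[i]    # aligned, address-space-aware load and store
    return nothing
end

# d_a, d_b and d_c are CuArrays; @cuda cudaconverts them to CuDeviceArrays
@cuda (1, 1024) vadd(d_a, d_b, d_c)
```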

Base.show(io::IO, a::CuDeviceVector{T}) where {T} =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray{T,N}) where {T,N} =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")


## quirks

# bounds checking is currently broken due to a PTX assembler issue (see #4)
Base.checkbounds(::CuDeviceArray, I...) = nothing

@@ -88,10 +96,19 @@ struct CuBoundsError <: Exception end
@inline Base.throw_boundserror(A::CuDeviceArray, I) =
(Base.@_noinline_meta; throw(CuBoundsError()))

# idem

## other

Base.show(io::IO, a::CuDeviceVector) =
print(io, "$(length(a))-element device array at $(pointer(a))")
Base.show(io::IO, a::CuDeviceArray) =
print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")

Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)

function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T}
Base.@_inline_meta
ptr = Base.unsafe_convert(Ptr{T}, A) + (I[1].start-1)*sizeof(T)
ptr = pointer(A) + (I[1].start-1)*sizeof(T)
len = I[1].stop - I[1].start + 1
return CuDeviceArray(len, ptr)
end
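Because pointer arithmetic on `DevicePtr` preserves the address-space parameter, a contiguous view keeps its tag as well; a sketch (valid inside a kernel):

```julia
buf = @cuStaticSharedMem(Float32, 64)   # CuDeviceArray{Float32,1,AS.Shared}
sub = view(buf, 9:16)                   # still backed by an AS.Shared pointer
```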
29 changes: 18 additions & 11 deletions src/device/intrinsics/memory_shared.jl
@@ -13,13 +13,16 @@ shmem_id = 0
function emit_shmem(id, llvmtyp, len, align)
var = Symbol("@shmem", id)
jltyp = jltypes[llvmtyp]

@gensym ptr
quote
Base.llvmcall(
$ptr = Base.llvmcall(
($"""$var = external addrspace(3) global [$len x $llvmtyp], align $align""",
$"""%1 = getelementptr inbounds [$len x $llvmtyp], [$len x $llvmtyp] addrspace(3)* $var, i64 0, i64 0
%2 = addrspacecast $llvmtyp addrspace(3)* %1 to $llvmtyp addrspace(0)*
ret $llvmtyp* %2"""),
Ptr{$jltyp}, Tuple{})
DevicePtr{$jltyp,AS.Shared}($ptr)
end
end
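For concreteness, with `id = 1`, `llvmtyp = float`, `len = 100` and `align = 4`, the quoted block above roughly expands to the following (a sketch; the actual `ptr` name is gensym'd):

```julia
ptr = Base.llvmcall(
    ("@shmem1 = external addrspace(3) global [100 x float], align 4",
     """%1 = getelementptr inbounds [100 x float], [100 x float] addrspace(3)* @shmem1, i64 0, i64 0
        %2 = addrspacecast float addrspace(3)* %1 to float addrspace(0)*
        ret float* %2"""),
    Ptr{Float32}, Tuple{})
DevicePtr{Float32,AS.Shared}(ptr)
```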

@@ -28,7 +31,7 @@ end
# to `@cuda`

"""
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ}
@cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ,Shared}

Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a statically-allocated piece of shared memory. The type should be statically
@@ -53,10 +56,11 @@ function emit_static_shmem{N, T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::
len = prod(shape)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, len, align))
CuDeviceArray($shape, $ptr)
end
end
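The macro thus yields a Shared-tagged device array directly. A usage sketch inside a hypothetical kernel (2017-era launch syntax):

```julia
function reverse_block(d_a)
    buf = @cuStaticSharedMem(Float32, 64)   # CuDeviceArray{Float32,1,AS.Shared}
    i = threadIdx().x
    buf[i] = d_a[i]
    sync_threads()
    d_a[i] = buf[blockDim().x - i + 1]
    return nothing
end

@cuda (1, 64) reverse_block(d_a)
```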

@@ -69,10 +73,11 @@ function emit_static_shmem{N}(id::Integer, jltyp::Type, shape::NTuple{N,<:Intege
len = prod(shape) * sizeof(jltyp)
align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, len, align))
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end

@@ -82,7 +87,7 @@ end


"""
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ}
@cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ,Shared}

Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -114,10 +119,11 @@ function emit_dynamic_shmem{T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::Un

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, ptr)
$ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
CuDeviceArray($shape, $ptr)
end
end

@@ -129,10 +135,11 @@ function emit_dynamic_shmem(id::Integer, jltyp::Type, shape::Union{Expr,Symbol},

align = datatype_align(jltyp)

@gensym ptr
return quote
Base.@_inline_meta
ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
$ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
end
end
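Dynamic shared memory works the same way, except that the allocation size is supplied at launch time; a sketch assuming the 2017-era launch tuple, whose optional third element is the dynamic shared-memory size in bytes:

```julia
function reverse_dynamic(d_a, n)
    buf = @cuDynamicSharedMem(Float32, n)   # backed by the launch-time allocation
    i = threadIdx().x
    buf[i] = d_a[i]
    sync_threads()
    d_a[i] = buf[n - i + 1]
    return nothing
end

n = 64
@cuda (1, n, n*sizeof(Float32)) reverse_dynamic(d_a, n)
```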
