From 654fdd8caebed72f116568f8116ace0023e6a364 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Fri, 28 Jul 2017 16:41:38 +0200
Subject: [PATCH] Introduce DevicePtr to track address space of pointers.

Optimize loads and stores using LLVM.jl.
---
 REQUIRE                                |   4 +-
 src/CUDAnative.jl                      |  15 ++-
 src/{device/util.jl => cgutils.jl}     |  31 +++++-
 src/device/array.jl                    |  81 ++++++++------
 src/device/intrinsics/memory_shared.jl |  29 +++--
 src/execution.jl                       |  38 +------
 src/pointer.jl                         | 148 +++++++++++++++++++++++++
 test/array.jl                          |  57 +++++++---
 test/codegen.jl                        |  19 ++++
 test/execution.jl                      |  22 ++--
 test/pointer.jl                        |  59 ++++++++++
 test/runtests.jl                       |   1 +
 12 files changed, 398 insertions(+), 106 deletions(-)
 rename src/{device/util.jl => cgutils.jl} (82%)
 create mode 100644 src/pointer.jl
 create mode 100644 test/pointer.jl

diff --git a/REQUIRE b/REQUIRE
index cb8420de..cb9f283f 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,3 +1,3 @@
 julia 0.6
-CUDAdrv 0.4.2
-LLVM 0.3.6
+CUDAdrv 0.5.0
+LLVM 0.3.8
diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl
index 2c0bfe17..21d7d4be 100644
--- a/src/CUDAnative.jl
+++ b/src/CUDAnative.jl
@@ -16,17 +16,22 @@ else
     false
 end
 
+include("cgutils.jl")
+include("pointer.jl")
+
+# needs to be loaded _before_ the compiler infrastructure, because of generated functions
+include(joinpath("device", "array.jl"))
+include(joinpath("device", "intrinsics.jl"))
+include(joinpath("device", "libdevice.jl"))
+
 include("jit.jl")
 include("profile.jl")
-include(joinpath("device", "util.jl"))
-include(joinpath("device", "array.jl"))
-include(joinpath("device", "intrinsics.jl")) # some of these files contain generated functions,
-include(joinpath("device", "libdevice.jl")) # so should get loaded late (JuliaLang/julia#19942)
 include("execution.jl")
 include("reflection.jl")
 
 const default_device = Ref{CuDevice}()
 const default_context = Ref{CuContext}()
+const jlctx = Ref{LLVM.Context}()
 
 function __init__()
     if !configured
         warn("CUDAnative.jl has not been configured, and will not work properly.")
@@ -49,6 +54,8 @@ function __init__()
     default_device[] = CuDevice(0)
     default_context[] = CuContext(default_device[])
 
+    jlctx[] = LLVM.Context(cglobal(:jl_LLVMContext, Void))
+
     init_jit()
 end
diff --git a/src/device/util.jl b/src/cgutils.jl
similarity index 82%
rename from src/device/util.jl
rename to src/cgutils.jl
index 55920194..f4a8f33d 100644
--- a/src/device/util.jl
+++ b/src/cgutils.jl
@@ -1,4 +1,4 @@
-# Utility functions for implementing intrinsics and other device code
+# Code generation utility functions
 
 # how to map primitive Julia types to LLVM data types
 const llvmtypes = Dict{Type,Symbol}(
@@ -146,3 +146,32 @@ Base.@pure function datatype_align(::Type{T}) where {T}
     field = T.layout + sizeof(UInt32)
     unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1)
 end
+
+
+# create an LLVM function, given its return (LLVM) type and a vector of argument types
+function create_llvmf(ret::LLVMType, params::Vector{LLVMType}, name::String="")::LLVM.Function
+    mod = LLVM.Module("llvmcall", jlctx[])
+
+    llvmf_typ = LLVM.FunctionType(ret, params)
+    llvmf = LLVM.Function(mod, name, llvmf_typ)
+    push!(function_attributes(llvmf), EnumAttribute("alwaysinline"))
+
+    return llvmf
+end
+
+# call an LLVM function, given its return (Julia) type, a tuple-type for the arguments,
+# and an expression yielding a tuple of the actual argument values.
+function call_llvmf(llvmf::LLVM.Function, ret::Type, params::Type, args::Expr)
+    quote
+        Base.@_inline_meta
+        Base.llvmcall(LLVM.ref($llvmf), $ret, $params, $args...)
+    end
+end
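+
+# Example (illustrative sketch, not part of the API): inside a generated function,
+# where the element type `T` is known, the two helpers above combine as follows:
+#
+#   eltyp = convert(LLVMType, T)
+#   llvmf = create_llvmf(eltyp, [eltyp])        # a T (T) function
+#   Builder(jlctx[]) do builder
+#       entry = BasicBlock(llvmf, "entry", jlctx[])
+#       position!(builder, entry)
+#       ret!(builder, parameters(llvmf)[1])     # identity: return the argument
+#   end
+#   call_llvmf(llvmf, T, Tuple{T}, :((x,)))     # expression to splice into the caller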
+
+function Base.convert(::Type{LLVMType}, typ::Type)
+    isboxed_ref = Ref{Bool}()
+    llvmtyp = LLVMType(ccall(:julia_type_to_llvm, LLVM.API.LLVMTypeRef,
+                             (Any, Ptr{Bool}), typ, isboxed_ref))
+    @assert !isboxed_ref[]
+    return llvmtyp
+end
diff --git a/src/device/array.jl b/src/device/array.jl
index eace069d..33b29082 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -9,7 +9,8 @@ export
 """
     CuDeviceArray(dims, ptr)
     CuDeviceArray{T}(dims, ptr)
     CuDeviceArray{T,N}(dims, ptr)
+    CuDeviceArray{T,N,A}(dims, ptr)
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
 pointer, where `N` is determined from the length of `dims` and `T` is determined from the
@@ -23,62 +24,69 @@ CuDeviceArray
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 #       because we're currently requiring a trailing pointer argument.
 
-struct CuDeviceArray{T,N} <: AbstractArray{T,N}
+struct CuDeviceArray{T,N,A} <: AbstractArray{T,N}
     shape::NTuple{N,Int}
-    ptr::Ptr{T}
+    ptr::DevicePtr{T,A}
 
-    # inner constructors (exact types, ie. Int not <:Integer)
-    CuDeviceArray{T,N}(shape::NTuple{N,Int}, ptr::Ptr{T}) where {T,N} = new(shape, ptr)
+    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
+    CuDeviceArray{T,N,A}(shape::NTuple{N,Int}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape, ptr)
 end
 
-const CuDeviceVector = CuDeviceArray{T,1} where {T}
-const CuDeviceMatrix = CuDeviceArray{T,2} where {T}
+const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
+const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}
 
 # outer constructors, non-parameterized
-CuDeviceArray(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
-CuDeviceArray(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
+CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
 
 # outer constructors, partially parameterized
-(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(dims, p)
-(::Type{CuDeviceArray{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((len,), p)
+(::Type{CuDeviceArray{T}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+(::Type{CuDeviceArray{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
+(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p)
+(::Type{CuDeviceVector{T}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p)
 
 # outer constructors, fully parameterized
-(::Type{CuDeviceArray{T,N}})(dims::NTuple{N,<:Integer}, p::Ptr{T}) where {T,N} = CuDeviceArray{T,N}(Int.(dims), p)
-(::Type{CuDeviceVector{T}})(len::Integer, p::Ptr{T}) where {T} = CuDeviceVector{T}((Int(len),), p)
+(::Type{CuDeviceArray{T,N,A}})(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p)
+(::Type{CuDeviceVector{T,A}})(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p)
 
-Base.convert(::Type{CuDeviceArray{T,N}}, a::CuArray{T,N}) where {T,N} =
-    CuDeviceArray{T,N}(a.shape, Base.unsafe_convert(Ptr{T}, a.devptr))
 
-Base.unsafe_convert(::Type{Ptr{T}}, a::CuDeviceArray{T}) where {T} = a.ptr::Ptr{T}
+## getters
 
-## array interface
+Base.pointer(a::CuDeviceArray) = a.ptr
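+
+# Example (hypothetical pointer value, for illustration only):
+#
+#   ptr = DevicePtr{Float32,AS.Global}(Ptr{Float32}(0x1000))
+#   a = CuDeviceArray((2, 3), ptr)   # inferred as CuDeviceArray{Float32,2,AS.Global}
+#   pointer(a) == ptr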
 
 Base.size(g::CuDeviceArray) = g.shape
 Base.length(g::CuDeviceArray) = prod(g.shape)
 
-@inline function Base.getindex(A::CuDeviceArray{T}, index::Int) where {T}
+
+## conversions
+
+Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)
+
+# from CuArray
+function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N}
+    owned_ptr = pointer(a)
+    ptr = Base.unsafe_convert(Ptr{T}, owned_ptr)
+    CuDeviceArray{T,N,AS.Global}(a.shape, DevicePtr{T,AS.Global}(ptr))
+end
+cudaconvert(::Type{CuArray{T,N}}) where {T,N} = CuDeviceArray{T,N,AS.Global}
+
+
+## indexing
+
+@inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T}
     @boundscheck checkbounds(A, index)
     align = datatype_align(T)
-    Base.pointerref(Base.unsafe_convert(Ptr{T}, A), index, align)::T
+    Base.unsafe_load(pointer(A), index, Val{align})::T
 end
 
-@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Int) where {T}
+@inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T}
     @boundscheck checkbounds(A, index)
     align = datatype_align(T)
-    Base.pointerset(Base.unsafe_convert(Ptr{T}, A), convert(T, x)::T, index, align)
+    Base.unsafe_store!(pointer(A), x, index, Val{align})
 end
 
 Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear()
 
-Base.show(io::IO, a::CuDeviceVector{T}) where {T} =
-    print(io, "$(length(a))-element device array at $(pointer(a))")
-Base.show(io::IO, a::CuDeviceArray{T,N}) where {T,N} =
-    print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")
-
-
-## quirks
-
 # bounds checking is currently broken due to a PTX assembler issue (see #4)
 Base.checkbounds(::CuDeviceArray, I...) = nothing
 
@@ -88,10 +96,19 @@ struct CuBoundsError <: Exception end
 @inline Base.throw_boundserror(A::CuDeviceArray, I) =
     (Base.@_noinline_meta; throw(CuBoundsError()))
 
-# idem
+
+## other
+
+Base.show(io::IO, a::CuDeviceVector) =
+    print(io, "$(length(a))-element device array at $(pointer(a))")
+Base.show(io::IO, a::CuDeviceArray) =
+    print(io, "$(join(a.shape, '×')) device array at $(pointer(a))")
+
+Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
+
 function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T}
     Base.@_inline_meta
-    ptr = Base.unsafe_convert(Ptr{T}, A) + (I[1].start-1)*sizeof(T)
+    ptr = pointer(A) + (I[1].start-1)*sizeof(T)
     len = I[1].stop - I[1].start + 1
     return CuDeviceArray(len, ptr)
 end
diff --git a/src/device/intrinsics/memory_shared.jl b/src/device/intrinsics/memory_shared.jl
index 3248a34b..2a2703bc 100644
--- a/src/device/intrinsics/memory_shared.jl
+++ b/src/device/intrinsics/memory_shared.jl
@@ -13,13 +13,16 @@ shmem_id = 0
 function emit_shmem(id, llvmtyp, len, align)
     var = Symbol("@shmem", id)
     jltyp = jltypes[llvmtyp]
+
+    @gensym ptr
     quote
-        Base.llvmcall(
+        $ptr = Base.llvmcall(
             ($"""$var = external addrspace(3) global [$len x $llvmtyp], align $align""",
              $"""%1 = getelementptr inbounds [$len x $llvmtyp], [$len x $llvmtyp] addrspace(3)* $var, i64 0, i64 0
                  %2 = addrspacecast $llvmtyp addrspace(3)* %1 to $llvmtyp addrspace(0)*
                  ret $llvmtyp* %2"""),
             Ptr{$jltyp}, Tuple{})
+        DevicePtr{$jltyp,AS.Shared}($ptr)
     end
 end
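+
+# Example (sketch): kernel code does not call emit_shmem directly, but goes through
+# the macros below, e.g.
+#
+#   function kernel()
+#       tmp = @cuStaticSharedMem(Float32, 64)   # CuDeviceArray{Float32,1,AS.Shared}
+#       tmp[threadIdx().x] = 0f0
+#       sync_threads()
+#       return nothing
+#   end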
@@ -28,7 +31,7 @@ end
 # to `@cuda`
 
 """
-    @cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ}
+    @cuStaticSharedMem(typ::Type, dims) -> CuDeviceArray{typ,Shared}
 
 Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a statically-allocated piece of shared memory. The type should be statically
@@ -53,10 +56,11 @@ function emit_static_shmem{N, T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::
     len = prod(shape)
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, llvmtyp, len, align))
-        CuDeviceArray{$jltyp}($shape, ptr)
+        $ptr = $(emit_shmem(id, llvmtyp, len, align))
+        CuDeviceArray($shape, $ptr)
     end
 end
 
@@ -69,10 +73,11 @@ function emit_static_shmem{N}(id::Integer, jltyp::Type, shape::NTuple{N,<:Intege
     len = prod(shape) * sizeof(jltyp)
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, :i8, len, align))
-        CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
+        $ptr = $(emit_shmem(id, :i8, len, align))
+        CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
     end
 end
 
@@ -82,7 +87,7 @@
 
 """
-    @cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ}
+    @cuDynamicSharedMem(typ::Type, dims, offset::Integer=0) -> CuDeviceArray{typ,Shared}
 
 Get an array of type `typ` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -114,10 +119,11 @@ function emit_dynamic_shmem{T<:LLVMTypes}(id::Integer, jltyp::Type{T}, shape::Un
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
-        CuDeviceArray{$jltyp}($shape, ptr)
+        $ptr = $(emit_shmem(id, llvmtyp, 0, align)) + $offset
+        CuDeviceArray($shape, $ptr)
     end
 end
 
@@ -129,10 +135,11 @@ function emit_dynamic_shmem(id::Integer, jltyp::Type, shape::Union{Expr,Symbol},
     align = datatype_align(jltyp)
 
+    @gensym ptr
     return quote
         Base.@_inline_meta
-        ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
-        CuDeviceArray{$jltyp}($shape, Base.unsafe_convert(Ptr{$jltyp}, ptr))
+        $ptr = $(emit_shmem(id, :i8, 0, align)) + $offset
+        CuDeviceArray($shape, Base.convert(DevicePtr{$jltyp}, $ptr))
     end
 end
diff --git a/src/execution.jl b/src/execution.jl
index b97de19f..3eb80673 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -1,6 +1,6 @@
 # Native execution support
 
-export @cuda, nearest_warpsize
+export @cuda, nearest_warpsize, cudaconvert
 
 using Base.Iterators: filter
 
@@ -9,32 +9,8 @@ using Base.Iterators: filter
 # Auxiliary
 #
 
-# Determine which type to pre-convert objects to for use on a CUDA device.
-#
-# The resulting object type will be used as a starting point to determine the final argument
-# types. This is different from `cconvert` in that we don't know which type to convert to.
-function convert_type(t)
-    # NOTE: this conversion was originally intended to be a user-extensible interface,
-    #       a la cconvert (look for cudaconvert in f1e592e61d6898869b918331e3e625292f4c8cab).
-    #
-    #       however, the generated function behind @cuda isn't allowed to call overloaded
-    #       functions (only pure ones), and also won't be able to see functions defined
-    #       after the generated function's body (see JuliaLang/julia#19942).
-
-    # Pointer handling
-    if t <: DevicePtr
-        return Ptr{t.parameters...}
-    elseif t <: Ptr
-        throw(InexactError())
-    end
-
-    # Array types
-    if t <: CuArray
-        return CuDeviceArray{t.parameters...}
-    end
-
-    return t
-end
+# NOTE: this method is used in a generated function, so it effectively cannot be
+#       extended by users: definitions loaded afterwards would not be visible there
+#       (see JuliaLang/julia#19942)
+cudaconvert(::Type{T}) where {T} = T
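+
+# Example: the identity fallback leaves most types alone, while device/array.jl
+# maps CuArray onto its device-side counterpart:
+#
+#   cudaconvert(Tuple{Int,Float32}) == Tuple{Int,Float32}
+#   cudaconvert(CuArray{Float32,2}) == CuDeviceArray{Float32,2,AS.Global}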
 
 # Convert the arguments to a kernel function to their CUDA representation, and figure out
 # what types to specialize the kernel function for.
@@ -45,7 +21,7 @@ function convert_arguments(args, types)
     # convert types to their CUDA representation
     for i in 1:length(argexprs)
         t = argtypes[i]
-        ct = convert_type(t)
+        ct = cudaconvert(t)
         if ct != t
             argtypes[i] = ct
             if ct <: Ptr
@@ -56,8 +32,6 @@ end
         end
     end
 
-    # NOTE: DevicePtr's should have disappeared after this point
-
     for argtype in argtypes
         if argtype.layout == C_NULL || !Base.datatype_pointerfree(argtype)
             error("don't know how to handle argument of type $argtype")
@@ -108,8 +82,8 @@ the launch should be scheduled.
 
 The `func` argument should be a valid Julia function. It will be compiled to a CUDA
 function upon first use, and to a certain extent arguments will be converted and managed
-automatically. Finally, a call to `cudacall` is performed, scheduling the compiled function
-for execution on the GPU.
+automatically (see [`cudaconvert`](@ref)). Finally, a call to `cudacall` is performed,
+scheduling the compiled function for execution on the GPU.
 """
 macro cuda(config::Expr, callexpr::Expr)
     # sanity checks
diff --git a/src/pointer.jl b/src/pointer.jl
new file mode 100644
index 00000000..938a64b7
--- /dev/null
+++ b/src/pointer.jl
@@ -0,0 +1,148 @@
+# Device pointer with address space information
+
+#
+# Address spaces
+#
+
+export
+    AS, addrspace
+
+abstract type AddressSpace end
+
+module AS
+
+using CUDAnative
+import CUDAnative: AddressSpace
+
+struct Generic  <: AddressSpace end
+struct Global   <: AddressSpace end
+struct Shared   <: AddressSpace end
+struct Constant <: AddressSpace end
+struct Local    <: AddressSpace end
+
+end
+
+
+#
+# Device pointer
+#
+
+struct DevicePtr{T,A}
+    ptr::Ptr{T}
+
+    # inner constructors, fully parameterized
+    DevicePtr{T,A}(ptr::Ptr{T}) where {T,A<:AddressSpace} = new(ptr)
+end
+
+# outer constructors, partially parameterized
+(::Type{DevicePtr{T}})(ptr::Ptr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr)
+
+# outer constructors, non-parameterized
+DevicePtr(ptr::Ptr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr)
+
+
+## getters
+
+Base.pointer(p::DevicePtr) = p.ptr
+
+Base.isnull(p::DevicePtr) = (pointer(p) == C_NULL)
+Base.eltype(::Type{<:DevicePtr{T}}) where {T} = T
+
+addrspace(x) = addrspace(typeof(x))
+addrspace(::Type{DevicePtr{T,A}}) where {T,A} = A
+
+
+## conversions
+
+# between regular and device pointers
+## simple conversions disallowed
+Base.convert(::Type{Ptr{T}}, p::DevicePtr{T}) where {T} = throw(InexactError())
+Base.convert(::Type{<:DevicePtr{T}}, p::Ptr{T}) where {T} = throw(InexactError())
+## unsafe ones are allowed
+Base.unsafe_convert(::Type{Ptr{T}}, p::DevicePtr{T}) where {T} = pointer(p)
+
+# defer conversions to DevicePtr to unsafe_convert
+Base.cconvert(::Type{<:DevicePtr}, x) = x
+
+# between device pointers
+Base.convert(::Type{<:DevicePtr}, p::DevicePtr) = throw(InexactError())
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{T,A}) where {T,A} = p
+Base.unsafe_convert(::Type{DevicePtr{T,A}}, p::DevicePtr) where {T,A} = DevicePtr{T,A}(reinterpret(Ptr{T}, pointer(p)))
+## identical addrspaces
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
+## convert to & from generic
+Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr) where {T} = Base.unsafe_convert(DevicePtr{T,AS.Generic}, p)
+Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,AS.Generic}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
+Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr{T,AS.Generic}) where {T} = p # avoid ambiguities
+## unspecified, preserve source addrspace
+Base.convert(::Type{DevicePtr{T}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p)
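+
+# Example of the rules above (hypothetical pointer values, for illustration):
+#
+#   p = DevicePtr{Float32,AS.Global}(Ptr{Float32}(1))
+#   convert(DevicePtr{Float32,AS.Generic}, p)   # ok: cast towards generic
+#   convert(DevicePtr{Float32}, p)              # ok: addrspace preserved (Global)
+#   convert(DevicePtr{Float32,AS.Shared}, p)    # throws InexactError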
+
+
+## limited pointer arithmetic & comparison
+
+Base.:(==)(a::DevicePtr, b::DevicePtr) = pointer(a) == pointer(b) && addrspace(a) == addrspace(b)
+
+Base.isless(x::DevicePtr, y::DevicePtr) = Base.isless(pointer(x), pointer(y))
+Base.:(-)(x::DevicePtr, y::DevicePtr) = pointer(x) - pointer(y)
+
+Base.:(+)(x::DevicePtr{T,A}, y::Integer) where {T,A} = DevicePtr{T,A}(pointer(x) + y)
+Base.:(-)(x::DevicePtr{T,A}, y::Integer) where {T,A} = DevicePtr{T,A}(pointer(x) - y)
+Base.:(+)(x::Integer, y::DevicePtr) = y + x
+
+
+## memory operations
+
+Base.convert(::Type{Int}, ::Type{AS.Generic})  = 0
+Base.convert(::Type{Int}, ::Type{AS.Global})   = 1
+Base.convert(::Type{Int}, ::Type{AS.Shared})   = 3
+Base.convert(::Type{Int}, ::Type{AS.Constant}) = 4
+Base.convert(::Type{Int}, ::Type{AS.Local})    = 5
+
+@generated function Base.unsafe_load(p::DevicePtr{T,A}, i::Integer=1,
+                                     ::Type{Val{align}}=Val{1}) where {T,A,align}
+    eltyp = convert(LLVMType, T)
+
+    # create a function
+    param_types = [LLVM.PointerType(eltyp),
+                   LLVM.IntType(sizeof(Int)*8, jlctx[])]
+    llvmf = create_llvmf(eltyp, param_types)
+
+    # generate IR
+    Builder(jlctx[]) do builder
+        entry = BasicBlock(llvmf, "entry", jlctx[])
+        position!(builder, entry)
+
+        ptr = gep!(builder, parameters(llvmf)[1], [parameters(llvmf)[2]])
+        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
+        val = load!(builder, ptr_with_as)
+        alignment!(val, align)
+        ret!(builder, val)
+    end
+
+    call_llvmf(llvmf, T, Tuple{Ptr{T}, Int}, :((pointer(p), Int(i-1))))
+end
+
+@generated function Base.unsafe_store!(p::DevicePtr{T,A}, x, i::Integer=1,
+                                       ::Type{Val{align}}=Val{1}) where {T,A,align}
+    eltyp = convert(LLVMType, T)
+
+    # create a function
+    param_types = [LLVM.PointerType(eltyp), eltyp,
+                   LLVM.IntType(sizeof(Int)*8, jlctx[])]
+    llvmf = create_llvmf(LLVM.VoidType(jlctx[]), param_types)
+
+    # generate IR
+    Builder(jlctx[]) do builder
+        entry = BasicBlock(llvmf, "entry", jlctx[])
+        position!(builder, entry)
+
+        ptr = gep!(builder, parameters(llvmf)[1], [parameters(llvmf)[3]])
+        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
+        val = parameters(llvmf)[2]
+        inst = store!(builder, val, ptr_with_as)
+        alignment!(inst, align)
+        ret!(builder)
+    end
+
+    call_llvmf(llvmf, Void, Tuple{Ptr{T}, T, Int}, :((pointer(p), convert(T,x), Int(i-1))))
+end
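+
+# Example (sketch): loads and stores through a DevicePtr emit address-space-qualified
+# IR; for a global pointer the accesses below go through `addrspace(1)`:
+#
+#   p = DevicePtr{Float32,AS.Global}(some_ptr)   # `some_ptr` obtained elsewhere
+#   x = Base.unsafe_load(p, 1, Val{16})          # 16-byte-aligned load
+#   Base.unsafe_store!(p, x + 1f0, 1, Val{16})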
diff --git a/test/array.jl b/test/array.jl
index 93a33056..a2ed806e 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -6,36 +6,46 @@
 
 # inner constructors
 let p = Ptr{Int}(C_NULL)
-    @on_device CuDeviceArray{Int,1}((1,), $p)
+    dp = CUDAnative.DevicePtr(p)
+    CuDeviceArray{Int,1,AS.Generic}((1,), dp)
 end
 
 # outer constructors
 for I in [Int32,Int64]
     a = I(1)
     b = I(2)
+    p = Ptr{I}(C_NULL)
+    dp = CUDAnative.DevicePtr(p)
 
     # not parameterized
-    @on_device CuDeviceArray($b, $p)
-    @on_device CuDeviceArray(($b,), $p)
-    @on_device CuDeviceArray(($b,$a), $p)
+    CuDeviceArray(b, dp)
+    CuDeviceArray((b,), dp)
+    CuDeviceArray((b,a), dp)
 
     # partially parameterized
-    @on_device CuDeviceArray{$I}($b, $p)
-    @on_device CuDeviceArray{$I}(($b,), $p)
-    @on_device CuDeviceArray{$I}(($a,$b), $p)
+    CuDeviceArray{I}(b, dp)
+    CuDeviceArray{I}((b,), dp)
+    CuDeviceArray{I}((a,b), dp)
+    CuDeviceArray{I,1}(b, dp)
+    CuDeviceArray{I,1}((b,), dp)
+    @test_throws MethodError CuDeviceArray{I,1}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,2}(b, dp)
+    @test_throws MethodError CuDeviceArray{I,2}((b,), dp)
+    CuDeviceArray{I,2}((a,b), dp)
 
     # fully parameterized
-    @on_device CuDeviceArray{$I,1}($b, $p)
-    @on_device CuDeviceArray{$I,1}(($b,), $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,1}(($a,$b), $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,2}($b, $p)
-    @test_throws ErrorException @on_device CuDeviceArray{$I,2}(($b,), $p)
-    @on_device CuDeviceArray{$I,2}(($a,$b), $p)
+    CuDeviceArray{I,1,AS.Generic}(b, dp)
+    CuDeviceArray{I,1,AS.Generic}((b,), dp)
+    @test_throws MethodError CuDeviceArray{I,1,AS.Generic}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,1,AS.Shared}((a,b), dp)
+    @test_throws MethodError CuDeviceArray{I,2,AS.Generic}(b, dp)
+    @test_throws MethodError CuDeviceArray{I,2,AS.Generic}((b,), dp)
+    CuDeviceArray{I,2,AS.Generic}((a,b), dp)
 
     # type aliases
-    @on_device CuDeviceVector{$I}($b, $p)
-    @on_device CuDeviceMatrix{$I}(($a,$b), $p)
+    CuDeviceVector{I}(b, dp)
+    CuDeviceMatrix{I}((a,b), dp)
 end
 end
@@ -78,14 +88,14 @@
 
     # NOTE: these tests verify that bounds checking is _disabled_ (see #4)
 
-    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_1d, (CuDeviceArray{Int,1},)))
+    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_1d, (CuDeviceArray{Int,1,AS.Global},)))
     @test !contains(ir, "trap")
 
     @eval function array_oob_2d(array)
         return array[1, 1]
     end
 
-    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_2d, (CuDeviceArray{Int,2},)))
+    ir = sprint(io->CUDAnative.code_llvm(io, array_oob_2d, (CuDeviceArray{Int,2,AS.Global},)))
     @test !contains(ir, "trap")
 end
 
@@ -119,4 +129,17 @@ end
 
 ############################################################################################
 
+
+@testset "bug: non-Int index to unsafe_load" begin
+    @eval function array_load_index(a)
+        return a[UInt64(1)]
+    end
+
+    a = [1]
+    p = pointer(a)
+    dp = CUDAnative.DevicePtr(p)
+    da = CUDAnative.CuDeviceArray(1, dp)
+    array_load_index(da)
+end
+
 end
diff --git a/test/codegen.jl b/test/codegen.jl
index ff274515..0b3cc3a7 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -81,6 +81,25 @@ end
     end
 end
 
+if Base.VERSION >= v"0.6.1-pre.1"
+    # JuliaLang/julia#22022 is required for AS-specific operations to work
+    # on certain structs, which this test verifies.
+    #
+    # The test is therefore disabled on older versions: it requires at least one
+    # commit on the release-0.6 branch (v0.6.1-pre.1), which we assume includes #22022.
+
+    @testset "LLVM D32593" begin
+        @eval struct llvm_D32593_struct
+            foo::Float32
+            bar::Float32
+        end
+
+        @eval llvm_D32593(arr) = arr[1].foo
+
+        CUDAnative.code_llvm(DevNull, llvm_D32593, Tuple{CuDeviceVector{llvm_D32593_struct,AS.Global}})
+    end
+end
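+
+# For reference (hedged expectation, not asserted by the test): the IR for
+# `llvm_D32593` should access the struct field through an addrspace(1) pointer,
+# i.e. something along the lines of:
+#
+#   %... = load float, float addrspace(1)* %..., align 4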
+
 end
diff --git a/test/execution.jl b/test/execution.jl
index 000c7a64..27d33bd6 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -137,7 +137,8 @@ len = prod(dims)
     input_dev = CuArray(input)
     output_dev = similar(input_dev)
 
-    @cuda (1,len) exec_pass_ptr(input_dev.devptr, output_dev.devptr)
+    @cuda (1,len) exec_pass_ptr(Base.unsafe_convert(Ptr{Float32}, input_dev),
+                                Base.unsafe_convert(Ptr{Float32}, output_dev))
     output = Array(output_dev)
     @test input ≈ output
 end
@@ -161,7 +162,8 @@ end
     arr_dev = CuArray(arr)
     val_dev = CuArray(val)
 
-    @cuda (1,len) exec_pass_scalar(arr_dev.devptr, val_dev.devptr)
+    @cuda (1,len) exec_pass_scalar(Base.unsafe_convert(Ptr{Float32}, arr_dev),
+                                   Base.unsafe_convert(Ptr{Float32}, val_dev))
     @test arr[dims...] ≈ Array(val_dev)[1]
 end
 
@@ -187,7 +189,8 @@ end
     arr_dev = CuArray(arr)
     val_dev = CuArray(val)
 
-    @cuda (1,len) exec_pass_scalar_devfun(arr_dev.devptr, val_dev.devptr)
+    @cuda (1,len) exec_pass_scalar_devfun(Base.unsafe_convert(Ptr{Float32}, arr_dev),
+                                          Base.unsafe_convert(Ptr{Float32}, val_dev))
     @test arr[dims...] ≈ Array(val_dev)[1]
 end
 
@@ -207,7 +210,7 @@ end
     keeps = (true,)
     d_out = CuArray{Int}(1)
 
-    @cuda (1,1) exec_pass_tuples(keeps, d_out.devptr)
+    @cuda (1,1) exec_pass_tuples(keeps, Base.unsafe_convert(Ptr{Int}, d_out))
     @test Array(d_out) == [1]
 end
 
@@ -231,7 +234,10 @@ end
         return nothing
     end
 
-    @cuda (1,len) exec_pass_ghost(ExecGhost(), d_a.devptr, d_b.devptr, d_c.devptr)
+    @cuda (1,len) exec_pass_ghost(ExecGhost(),
+                                  Base.unsafe_convert(Ptr{Float32}, d_a),
+                                  Base.unsafe_convert(Ptr{Float32}, d_b),
+                                  Base.unsafe_convert(Ptr{Float32}, d_c))
     c = Array(d_c)
     @test a+b == c
 
@@ -245,7 +251,9 @@ end
         return nothing
     end
 
-    @cuda (1,len) exec_pass_ghost_aggregate(ExecGhost(), d_c.devptr, (42,))
+    @cuda (1,len) exec_pass_ghost_aggregate(ExecGhost(),
+                                            Base.unsafe_convert(Ptr{Float32}, d_c),
+                                            (42,))
     c = Array(d_c)
     @test all(val->val==42, c)
 
@@ -263,7 +271,7 @@ end
     A = CuArray(zeros(Float32, (1,)))
     x = Complex64(2,2)
 
-    @cuda (1, 1) exec_pass_immutables(A.devptr, x)
+    @cuda (1, 1) exec_pass_immutables(Base.unsafe_convert(Ptr{Float32}, A), x)
     @test Array(A) == Float32[imag(x)]
 end
 
diff --git a/test/pointer.jl b/test/pointer.jl
new file mode 100644
index 00000000..515d3c67
--- /dev/null
+++ b/test/pointer.jl
@@ -0,0 +1,59 @@
+@testset "pointer" begin
+
+# inner constructors
+
+const generic_null = CUDAnative.DevicePtr{Void,AS.Generic}(C_NULL)
+const global_null = CUDAnative.DevicePtr{Void,AS.Global}(C_NULL)
+const local_null = CUDAnative.DevicePtr{Void,AS.Local}(C_NULL)
+
+const C_NONNULL = Ptr{Void}(1)
+const generic_nonnull = CUDAnative.DevicePtr{Void,AS.Generic}(C_NONNULL)
+const global_nonnull = CUDAnative.DevicePtr{Void,AS.Global}(C_NONNULL)
+const local_nonnull = CUDAnative.DevicePtr{Void,AS.Local}(C_NONNULL)
+
+const C_ONE = Ptr{Int}(1)
+const generic_one = CUDAnative.DevicePtr{Int,AS.Generic}(C_ONE)
+const global_one = CUDAnative.DevicePtr{Int,AS.Global}(C_ONE)
+const local_one = CUDAnative.DevicePtr{Int,AS.Local}(C_ONE)
+
+# outer constructors
+@test CUDAnative.DevicePtr{Void}(C_NULL) == generic_null
+@test CUDAnative.DevicePtr(C_NULL) == generic_null
+
+# getters
+@test eltype(generic_null) == Void
+@test addrspace(generic_null) == AS.Generic
+@test isnull(generic_null)
+@test !isnull(generic_nonnull)
+
+# comparisons
+@test generic_null != generic_one
+@test generic_null != global_null
+@test local_null != global_null
+
+
+@testset "conversions" begin
+
+# between regular and device pointers
+
+@test_throws InexactError convert(Ptr{Void}, generic_null)
+@test_throws InexactError convert(CUDAnative.DevicePtr{Void}, C_NULL)
+
+@test Base.unsafe_convert(Ptr{Void}, generic_null) == C_NULL
+
+
+# between device pointers
+
+@test_throws InexactError convert(typeof(local_null), global_null)
+@test convert(typeof(generic_null), generic_null) == generic_null
+@test convert(typeof(global_null), global_null) == global_null
+@test Base.unsafe_convert(typeof(local_null), global_null) == local_null
+
+@test convert(typeof(global_null), global_one) == global_nonnull
+@test convert(typeof(generic_null), global_one) == generic_nonnull
+@test convert(typeof(global_null), generic_one) == global_nonnull
+@test convert(CUDAnative.DevicePtr{Void}, global_one) == global_nonnull
+
+end
+
+end
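+
+# NOTE (illustrative): arithmetic preserves the address space, e.g.
+#   global_one + sizeof(Int)  isa  CUDAnative.DevicePtr{Int,AS.Global}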
diff --git a/test/runtests.jl b/test/runtests.jl
index 0487897d..c0556c2a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using Base.Test
 
 include("util.jl")
 include("base.jl")
+include("pointer.jl")
 
 if CUDAnative.configured    # requiring a configured LLVM.jl