Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

at-roc: Add boundscheck flag #382

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions src/compiler/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,22 @@ function delete_exception_user!(mod::LLVM.Module)
end
@assert !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user")
end
"""
    replace_boundscheck!(mod::LLVM.Module, boundscheck::Bool)

Turn the `__global_boundscheck` global of `mod` into a constant holding `boundscheck`.

When the module declares that global, it is given private linkage, marked constant,
flagged as not externally initialized, and initialized with `UInt8(boundscheck)`.
A module that does not declare the global is left unchanged.
"""
function replace_boundscheck!(mod::LLVM.Module, boundscheck::Bool)
    flag_name = "__global_boundscheck"
    module_globals = LLVM.globals(mod)
    # Nothing to rewrite when the kernel never referenced the flag.
    haskey(module_globals, flag_name) || return nothing

    flag = module_globals[flag_name]
    linkage!(flag, LLVM.API.LLVMPrivateLinkage)
    constant!(flag, true)
    extinit!(flag, false)
    flag_value = ConstantInt(UInt8(boundscheck); ctx=context(mod))
    initializer!(flag, flag_value)
end

## GPUCompiler interface

# Compiler parameters carried by every ROC compiler job.
struct ROCCompilerParams <: AbstractCompilerParams
# Device the kernel is compiled for.
device::ROCDevice
# User-supplied hooks, looked up by global name when initializing kernel globals.
global_hooks::NamedTuple
# Value given to the `__global_boundscheck` flag for this job.
boundscheck::Bool
end

# Alias for compiler jobs that pair the GCN target with `ROCCompilerParams`.
const ROCCompilerJob = CompilerJob{GCNCompilerTarget,ROCCompilerParams}
Expand All @@ -70,6 +80,9 @@ function GPUCompiler.process_module!(job::ROCCompilerJob, mod::LLVM.Module)
job, mod)
# Run this early (before optimization) to ensure we link OCKL
emit_exception_user!(mod)

# Replace access to boundscheck flag early to enable optimizations
replace_boundscheck!(mod, job.params.boundscheck)
end
function GPUCompiler.process_entry!(job::ROCCompilerJob, mod::LLVM.Module, entry::LLVM.Function)
invoke(GPUCompiler.process_entry!,
Expand Down Expand Up @@ -151,14 +164,14 @@ The output of this function is automatically cached, i.e. you can simply call
generated automatically, when function definitions change, or when different
types or keyword arguments are provided.
"""
function rocfunction(f::F, tt::Type=Tuple{}; name=nothing, device=AMDGPU.default_device(), global_hooks=NamedTuple()) where {F <: Core.Function}
function rocfunction(f::F, tt::Type=Tuple{}; name=nothing, device=AMDGPU.default_device(), global_hooks=NamedTuple(), boundscheck::Bool=true) where {F <: Core.Function}
source = FunctionSpec(F, tt, true, name)
cache = get!(()->Dict{UInt, Any}(), rocfunction_cache, device)

isa = AMDGPU.default_isa(device)
dev_isa, features = Runtime.llvm_arch_features(isa)
target = GCNCompilerTarget(; dev_isa, features)
params = ROCCompilerParams(device, global_hooks)
params = ROCCompilerParams(device, global_hooks, boundscheck)
job = CompilerJob(target, source, params; always_inline=true)
@debug "Compiling $f($(join(tt.parameters, ", ")))"
Runtime.@log_start(:cached_compile, (;f=F, tt), nothing)
Expand Down Expand Up @@ -213,7 +226,9 @@ function rocfunction_link(@nospecialize(job::CompilerJob), compiled)
# initialize globals from hooks
for gname in first.(globals)
hook = nothing
if haskey(default_global_hooks, gname)
if gname == :__global_boundscheck
hook = boundscheck_hook(job.params.boundscheck)
elseif haskey(default_global_hooks, gname)
hook = default_global_hooks[gname]
elseif haskey(global_hooks, gname)
hook = global_hooks[gname]
Expand Down
7 changes: 7 additions & 0 deletions src/compiler/global-hooks.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Built-in hooks keyed by the name of the global they initialize.
# Each hook is called as `hook(gbl, mod, device)`.
const default_global_hooks = Dict{Symbol,Function}()

"""
    boundscheck_hook(boundscheck::Bool)

Build a global hook that writes `boundscheck` into the global as a single `UInt8`.

The returned function takes `(gbl, mod, device)`. Only `gbl` is used: it is converted
to a `Ptr{UInt8}` and `UInt8(boundscheck)` is stored through it.
"""
function boundscheck_hook(boundscheck::Bool)
    flag_byte = UInt8(boundscheck)
    function write_flag(gbl, mod, device)
        dest = Base.unsafe_convert(Ptr{UInt8}, gbl)
        return Base.unsafe_store!(dest, flag_byte)
    end
    return write_flag
end

default_global_hooks[:__global_output_context] = (gbl, mod, device) -> begin
# initialize global output context
gbl_ptr = Base.unsafe_convert(Ptr{AMDGPU.Device.GLOBAL_OUTPUT_CONTEXT_TYPE}, gbl)
Expand Down
11 changes: 9 additions & 2 deletions src/device/gcn/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,21 @@ Base.unsafe_convert(::Type{LLVMPtr{T,A}}, a::ROCDeviceArray{T,N,A}) where {T,A,N
Base.datatype_alignment(T)
end

# Read the per-kernel `__global_boundscheck` flag as a `Bool`.
@inline function boundscheck_enabled()
    flag_ptr = get_global_pointer(Val(:__global_boundscheck), Bool)
    return unsafe_load(flag_ptr)
end

# Linear-index read of one element from a device array.
#
# Bounds are checked only when the kernel's `__global_boundscheck` flag is set.
# The check is also wrapped in `@boundscheck`, so callers using `@inbounds` can elide it.
# The load is issued with the alignment reported by `alignment(A)`.
@device_function @inline function Base.getindex(A::ROCDeviceArray{T}, index::Integer) where {T}
    # No unconditional check here: it would run even when the flag is off
    # and defeat `boundscheck=false`.
    if boundscheck_enabled()
        @boundscheck checkbounds(A, index)
    end
    align = alignment(A)
    Base.unsafe_load(pointer(A), index, Val(align))::T
end

@device_function @inline function Base.setindex!(A::ROCDeviceArray{T}, x, index::Integer) where {T}
@boundscheck checkbounds(A, index)
if boundscheck_enabled()
@boundscheck checkbounds(A, index)
end
align = alignment(A)
Base.unsafe_store!(pointer(A), x, index, Val(align))
return A
Expand Down
3 changes: 2 additions & 1 deletion src/highlevel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ rocconvert(arg) = adapt(Runtime.Adaptor(), arg)
function split_kwargs(kwargs)
alias_kws = Dict(:stream=>:queue)
macro_kws = [:dynamic, :launch, :wait, :mark]
compiler_kws = [:name, :global_hooks]
compiler_kws = [:name, :global_hooks, :boundscheck]
call_kws = [:gridsize, :groupsize, :config]
signal_kws = [:queue, :signal, :soft, :minlat, :timeout]
kernel_kws = [:localmem]
Expand Down Expand Up @@ -322,6 +322,7 @@ Keyword arguments that affect various parts of `@roc`:
Keyword arguments that control kernel compilation via [`rocfunction`](@ref) and [`dynamic_rocfunction`](@ref):
- `name::Union{String,Nothing} = nothing`: If not `nothing`, the name to use for the generated kernel.
- `global_hooks::NamedTuple = (;)`: The set of global compiler hooks to use to initialize memory accessed by the kernel. See `AMDGPU.Compiler.default_global_hooks` for an example of how to implement these.
- `boundscheck::Bool = true`: If `false`, disables all bounds checking within the kernel. The default of `true` keeps bounds checking enabled, except where `@inbounds` is used.

Keyword arguments that control signal creation via [`AMDGPU.create_event`](@ref):
- `signal::ROCSignal = ROCSignal()`: The underlying signal object to associate the high-level `ROCKernelSignal` with.
Expand Down