From 8872a80702a96e21ef6ef06de81acde31bd39c11 Mon Sep 17 00:00:00 2001 From: Julian P Samaroo Date: Wed, 8 Feb 2023 17:28:35 -0600 Subject: [PATCH] at-roc: Add boundscheck flag Similar to Julia's `--check-bounds` flag, this flag (used like `@roc boundscheck=false ...`) allows the user to entirely disable boundschecking within their code. --- src/compiler/codegen.jl | 21 ++++++++++++++++++--- src/compiler/global-hooks.jl | 7 +++++++ src/device/gcn/array.jl | 11 +++++++++-- src/highlevel.jl | 3 ++- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl index bf63d8699..31ea8174d 100644 --- a/src/compiler/codegen.jl +++ b/src/compiler/codegen.jl @@ -45,12 +45,22 @@ function delete_exception_user!(mod::LLVM.Module) end @assert !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user") end +function replace_boundscheck!(mod::LLVM.Module, boundscheck::Bool) + if haskey(LLVM.globals(mod), "__global_boundscheck") + gbl = LLVM.globals(mod)["__global_boundscheck"] + linkage!(gbl, LLVM.API.LLVMPrivateLinkage) + constant!(gbl, true) + extinit!(gbl, false) + initializer!(gbl, ConstantInt(UInt8(boundscheck); ctx=context(mod))) + end +end ## GPUCompiler interface struct ROCCompilerParams <: AbstractCompilerParams device::ROCDevice global_hooks::NamedTuple + boundscheck::Bool end const ROCCompilerJob = CompilerJob{GCNCompilerTarget,ROCCompilerParams} @@ -70,6 +80,9 @@ function GPUCompiler.process_module!(job::ROCCompilerJob, mod::LLVM.Module) job, mod) # Run this early (before optimization) to ensure we link OCKL emit_exception_user!(mod) + + # Replace access to boundscheck flag early to enable optimizations + replace_boundscheck!(mod, job.params.boundscheck) end function GPUCompiler.process_entry!(job::ROCCompilerJob, mod::LLVM.Module, entry::LLVM.Function) invoke(GPUCompiler.process_entry!, @@ -151,14 +164,14 @@ The output of this function is automatically cached, i.e. you can simply call generated automatically, when function definitions change, or when different types or keyword arguments are provided. """ -function rocfunction(f::F, tt::Type=Tuple{}; name=nothing, device=AMDGPU.default_device(), global_hooks=NamedTuple()) where {F <: Core.Function} +function rocfunction(f::F, tt::Type=Tuple{}; name=nothing, device=AMDGPU.default_device(), global_hooks=NamedTuple(), boundscheck::Bool=true) where {F <: Core.Function} source = FunctionSpec(F, tt, true, name) cache = get!(()->Dict{UInt, Any}(), rocfunction_cache, device) isa = AMDGPU.default_isa(device) dev_isa, features = Runtime.llvm_arch_features(isa) target = GCNCompilerTarget(; dev_isa, features) - params = ROCCompilerParams(device, global_hooks) + params = ROCCompilerParams(device, global_hooks, boundscheck) job = CompilerJob(target, source, params; always_inline=true) @debug "Compiling $f($(join(tt.parameters, ", ")))" Runtime.@log_start(:cached_compile, (;f=F, tt), nothing) @@ -213,7 +226,9 @@ function rocfunction_link(@nospecialize(job::CompilerJob), compiled) # initialize globals from hooks for gname in first.(globals) hook = nothing - if haskey(default_global_hooks, gname) + if gname == :__global_boundscheck + hook = boundscheck_hook(job.params.boundscheck) + elseif haskey(default_global_hooks, gname) hook = default_global_hooks[gname] elseif haskey(global_hooks, gname) hook = global_hooks[gname] diff --git a/src/compiler/global-hooks.jl b/src/compiler/global-hooks.jl index da7d1edb4..e1c072c1c 100644 --- a/src/compiler/global-hooks.jl +++ b/src/compiler/global-hooks.jl @@ -1,5 +1,12 @@ const default_global_hooks = Dict{Symbol,Function}() +function boundscheck_hook(boundscheck::Bool) + return (gbl, mod, device) -> begin + gbl_ptr = Base.unsafe_convert(Ptr{UInt8}, gbl) + Base.unsafe_store!(gbl_ptr, UInt8(boundscheck)) + end +end + default_global_hooks[:__global_output_context] = (gbl, mod, device) -> begin # initialize global output context gbl_ptr = Base.unsafe_convert(Ptr{AMDGPU.Device.GLOBAL_OUTPUT_CONTEXT_TYPE}, gbl) diff --git a/src/device/gcn/array.jl b/src/device/gcn/array.jl index 8ba74490f..7ccb2b3bc 100644 --- a/src/device/gcn/array.jl +++ b/src/device/gcn/array.jl @@ -83,14 +83,21 @@ Base.unsafe_convert(::Type{LLVMPtr{T,A}}, a::ROCDeviceArray{T,N,A}) where {T,A,N Base.datatype_alignment(T) end +@inline boundscheck_enabled() = + unsafe_load(get_global_pointer(Val(:__global_boundscheck), Bool)) + @device_function @inline function Base.getindex(A::ROCDeviceArray{T}, index::Integer) where {T} - @boundscheck checkbounds(A, index) + if boundscheck_enabled() + @boundscheck checkbounds(A, index) + end align = alignment(A) Base.unsafe_load(pointer(A), index, Val(align))::T end @device_function @inline function Base.setindex!(A::ROCDeviceArray{T}, x, index::Integer) where {T} - @boundscheck checkbounds(A, index) + if boundscheck_enabled() + @boundscheck checkbounds(A, index) + end align = alignment(A) Base.unsafe_store!(pointer(A), x, index, Val(align)) return A diff --git a/src/highlevel.jl b/src/highlevel.jl index 0c90a32c7..772206ed4 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -194,7 +194,7 @@ rocconvert(arg) = adapt(Runtime.Adaptor(), arg) function split_kwargs(kwargs) alias_kws = Dict(:stream=>:queue) macro_kws = [:dynamic, :launch, :wait, :mark] - compiler_kws = [:name, :global_hooks] + compiler_kws = [:name, :global_hooks, :boundscheck] call_kws = [:gridsize, :groupsize, :config] signal_kws = [:queue, :signal, :soft, :minlat, :timeout] kernel_kws = [:localmem] @@ -322,6 +322,7 @@ Keyword arguments that affect various parts of `@roc`: Keyword arguments that control kernel compilation via [`rocfunction`](@ref) and [`dynamic_rocfunction`](@ref): - `name::Union{String,Nothing} = nothing`: If not `nothing`, the name to use for the generated kernel. - `global_hooks::NamedTuple = (;)`: The set of global compiler hooks to use to initialize memory accessed by the kernel. See `AMDGPU.Compiler.default_global_hooks` for an example of how to implement these. +- `boundscheck::Bool = true`: If `false`, disables all boundschecking within the kernel. The default of `true` enables boundschecking unless `@inbounds` is used. Keyword arguments that control signal creation via [`AMDGPU.create_event`](@ref): - `signal::ROCSignal = ROCSignal()`: The underlying signal object to associate the high-level `ROCKernelSignal` with.