This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Merge pull request #627 from JuliaGPU/tb/test_threads
Thread safety of memory allocator
maleadt authored Mar 13, 2020
2 parents 98ee4b3 + 7d4ca14 commit e18e339
Showing 9 changed files with 444 additions and 317 deletions.
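The core of the change: the global `usage` counter moves from a plain `Ref` to a `Threads.Atomic{Int}` so it can be updated without holding `memory_lock`, statistics bookkeeping moves out of the critical sections (and is therefore only approximate), and the lock now guards just the shared dictionaries. A minimal sketch of that locking discipline, with illustrative names rather than the actual CuArrays code:

# Sketch of the synchronization pattern this commit adopts (names are
# illustrative, not from the diff).
using Base.Threads

const usage_counter = Atomic{Int}(0)       # hot counter: updated lock-free
const tracked = Dict{Int,Vector{UInt8}}()  # shared dict: guarded by a lock
const dict_lock = ReentrantLock()

function track!(key, buf)
    atomic_add!(usage_counter, length(buf))  # atomic update, no lock held
    lock(dict_lock) do                       # only the dict mutation is serialized
        tracked[key] = buf
    end
end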
1 change: 1 addition & 0 deletions .gitlab-ci.yml
@@ -4,6 +4,7 @@ include:
 image: ubuntu:bionic

 variables:
+  JULIA_NUM_THREADS: '4'
   JULIA_DEBUG: 'CuArrays'
   CI_APT_INSTALL: 'libgomp1'
   NVIDIA_VISIBLE_DEVICES: 'all'
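Setting `JULIA_NUM_THREADS: '4'` makes CI run the test suite with four Julia threads, so the allocator is actually exercised concurrently. A hypothetical stress test of the kind this enables (it assumes a working GPU and is not the actual CuArrays test suite):

# Hypothetical concurrent allocation stress test.
using CuArrays

Threads.@threads for i in 1:1000
    a = CuArray{Float32}(undef, 1024)  # allocate from several threads at once
    CuArrays.unsafe_free!(a)           # and return the buffer to the pool
end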
4 changes: 2 additions & 2 deletions Manifest.toml
@@ -40,8 +40,8 @@ version = "6.0.1"

 [[CUDAnative]]
 deps = ["Adapt", "BinaryProvider", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "MacroTools", "Pkg", "Printf", "TimerOutputs"]
-git-tree-sha1 = "28ce25dd417c5edca9199071184952cd153ec069"
-repo-rev = "674d139a65e642d4332e32b18f4b966f9d19e0f9"
+git-tree-sha1 = "e6742ce88d11f1fdf6a9357ba738735f86ce67b5"
+repo-rev = "58c6755445c05ff26f1bdc5c12c7ae0aa6c39bc2"
 repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
 version = "2.10.2"
127 changes: 74 additions & 53 deletions src/memory.jl
@@ -5,7 +5,8 @@ using TimerOutputs

 using Base: @lock

-# global lock for shared resources (alloc stats, usage limits, etc).
+# global lock for shared object dicts (allocated, requested).
+# stats are not covered by this and cannot be assumed to be exact.
 # each allocator needs to lock its own resources separately too.
 const memory_lock = ReentrantLock()

@@ -48,12 +49,12 @@ called.
 """
 alloc_timings() = (show(alloc_to; allocations=false, sortby=:name); println())

-const usage = Ref(0)
+const usage = Threads.Atomic{Int}(0)
 const usage_limit = Ref{Union{Nothing,Int}}(nothing)

 const allocated = Dict{CuPtr{Nothing},Mem.DeviceBuffer}()

-function actual_alloc(bytes)::Union{Nothing,CuPtr{Nothing}}
+function actual_alloc(bytes)
     # check the memory allocation limit
     if usage_limit[] !== nothing
         if usage[] + bytes > usage_limit[]
@@ -66,6 +67,7 @@ function actual_alloc(bytes)::Union{Nothing,CuPtr{Nothing}}
         time = Base.@elapsed begin
             @timeit_debug alloc_to "alloc" buf = Mem.alloc(Mem.Device, bytes)
         end
+        Threads.atomic_add!(usage, bytes)
         time, buf
     catch err
         (isa(err, CuError) && err.code == CUDAdrv.ERROR_OUT_OF_MEMORY) || rethrow()
@@ -74,39 +76,37 @@
     @assert sizeof(buf) == bytes
     ptr = convert(CuPtr{Nothing}, buf)

-    # manage state
+    # record the buffer
     @lock memory_lock begin
-        alloc_stats.actual_time += time
-        alloc_stats.actual_nalloc += 1
-        alloc_stats.actual_alloc += bytes
-        usage[] += bytes
         @assert !haskey(allocated, ptr)
         allocated[ptr] = buf
     end

+    alloc_stats.actual_time += time
+    alloc_stats.actual_nalloc += 1
+    alloc_stats.actual_alloc += bytes
+
     return ptr
 end

 function actual_free(ptr::CuPtr{Nothing})
     # look up the buffer
     buf = @lock memory_lock begin
-        allocated[ptr]
+        buf = allocated[ptr]
+        delete!(allocated, ptr)
+        buf
     end
     bytes = sizeof(buf)

     # free the memory
     @timeit_debug alloc_to "free" begin
         time = Base.@elapsed Mem.free(buf)
+        Threads.atomic_sub!(usage, bytes)
     end

-    # manage state
-    @lock memory_lock begin
-        alloc_stats.actual_time += time
-        alloc_stats.actual_nfree += 1
-        alloc_stats.actual_free += bytes
-        usage[] -= bytes
-        delete!(allocated, ptr)
-    end
+    alloc_stats.actual_time += time
+    alloc_stats.actual_nfree += 1
+    alloc_stats.actual_free += bytes

     return
 end
@@ -148,7 +148,7 @@ const pool = Ref{Module}(BinnedPool)

 export OutOfGPUMemoryError

-const requested = Dict{CuPtr{Nothing},Int}()
+const requested = Dict{CuPtr{Nothing},Vector}()

 """
     OutOfGPUMemoryError()
@@ -171,24 +171,27 @@ end
 Allocate a number of bytes `sz` from the memory pool. Returns a `CuPtr{Nothing}`; may throw
 a [`OutOfGPUMemoryError`](@ref) if the allocation request cannot be satisfied.
 """
-@inline function alloc(sz)::CuPtr{Nothing}
+@inline function alloc(sz)
     # 0-byte allocations shouldn't hit the pool
     sz == 0 && return CU_NULL

     time = Base.@elapsed begin
-        @pool_timeit "pooled alloc" ptr = pool[].alloc(sz)
+        @pool_timeit "pooled alloc" ptr = pool[].alloc(sz)::Union{Nothing,CuPtr{Nothing}}
     end
     ptr === nothing && throw(OutOfGPUMemoryError(sz))

-    # manage state
-    @lock memory_lock begin
-        alloc_stats.pool_time += time
-        alloc_stats.pool_nalloc += 1
-        alloc_stats.pool_alloc += sz
-        @assert !haskey(requested, ptr)
-        requested[ptr] = sz
+    # record the allocation
+    if Base.JLOptions().debug_level >= 2
+        @lock memory_lock begin
+            @assert !haskey(requested, ptr)
+            requested[ptr] = backtrace()
+        end
     end

+    alloc_stats.pool_time += time
+    alloc_stats.pool_nalloc += 1
+    alloc_stats.pool_alloc += sz
+
    return ptr
 end

@@ -201,18 +204,20 @@ Releases a buffer pointed to by `ptr` to the memory pool.
     # 0-byte allocations shouldn't hit the pool
     ptr == CU_NULL && return

+    # record the allocation
+    if Base.JLOptions().debug_level >= 2
+        @lock memory_lock begin
+            @assert haskey(requested, ptr)
+            delete!(requested, ptr)
+        end
+    end
+
     time = Base.@elapsed begin
         @pool_timeit "pooled free" pool[].free(ptr)
     end

-    # manage state
-    @lock memory_lock begin
-        alloc_stats.pool_time += time
-        alloc_stats.pool_nfree += 1
-        @assert haskey(requested, ptr)
-        sz = requested[ptr]
-        delete!(requested, ptr)
-    end
+    alloc_stats.pool_time += time
+    alloc_stats.pool_nfree += 1

     return
 end
@@ -370,12 +375,11 @@ function memory_status(io::IO=stdout)
     free_bytes, total_bytes = CUDAdrv.Mem.info()
     used_bytes = total_bytes - free_bytes
     used_ratio = used_bytes / total_bytes
-
     @printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n",
             100*used_ratio, Base.format_bytes(used_bytes),
             Base.format_bytes(total_bytes))

-    @printf(io, "CuArrays GPU memory usage: %s", Base.format_bytes(usage[]))
+    @printf(io, "CuArrays allocator usage: %s", Base.format_bytes(usage[]))
     if usage_limit[] !== nothing
         @printf(io, " (capped at %s)", Base.format_bytes(usage_limit[]))
     end
@@ -384,23 +388,29 @@
     alloc_used_bytes = pool[].used_memory()
     alloc_cached_bytes = pool[].cached_memory()
     alloc_total_bytes = alloc_used_bytes + alloc_cached_bytes

     @printf(io, "%s usage: %s (%s allocated, %s cached)\n", nameof(pool[]),
             Base.format_bytes(alloc_total_bytes), Base.format_bytes(alloc_used_bytes),
             Base.format_bytes(alloc_cached_bytes))

-    requested_bytes = reduce(+, values(requested); init=0)
-
-    @printf(io, "%s efficiency: %.2f%% (%s requested, %s allocated)\n", nameof(pool[]),
-            100*requested_bytes/usage[],
-            Base.format_bytes(requested_bytes),
-            Base.format_bytes(usage[]))
-
     # check if the memory usage as counted by the CUDA allocator wrapper
     # matches what is reported by the pool implementation
-    discrepancy = usage[] - alloc_total_bytes
+    discrepancy = abs(usage[] - alloc_total_bytes)
     if discrepancy != 0
-        @debug "Discrepancy of $(Base.format_bytes(discrepancy)) between memory pool and allocator"
+        println(io, "Discrepancy of $(Base.format_bytes(discrepancy)) between memory pool and allocator!")
     end

+    if Base.JLOptions().debug_level >= 2
+        @lock memory_lock begin
+            for (ptr, bt) in requested
+                buf = allocated[ptr]
+                @printf(io, "\nOutstanding memory allocation of %s at %p",
+                        Base.format_bytes(sizeof(buf)), Int(ptr))
+                stack = stacktrace(bt, false)
+                StackTraces.remove_frames!(stack, :alloc)
+                Base.show_backtrace(io, stack)
+                println(io)
+            end
+        end
+    end
 end
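With backtraces stored in `requested`, `memory_status` can now print where each outstanding allocation came from. Rendering a raw backtrace the way the loop above does, as a standalone sketch (it assumes `bt` was captured earlier with `backtrace()`):

# Sketch: turn a raw backtrace into a readable report.
bt = backtrace()
stack = stacktrace(bt, false)                  # resolve frames, skip C frames
StackTraces.remove_frames!(stack, :backtrace)  # drop frames above the capture point
Base.show_backtrace(stdout, stack)
println()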

@@ -413,21 +423,32 @@ end
 Enable the recording of debug timings.
 """
 enable_timings() = (TimerOutputs.enable_debug_timings(CuArrays); return)
+disable_timings() = (TimerOutputs.disable_debug_timings(CuArrays); return)

 function __init_memory__()
     if haskey(ENV, "CUARRAYS_MEMORY_LIMIT")
-        usage_limit[] = parse(Int, ENV["CUARRAYS_MEMORY_LIMIT"])
+        Base.depwarn("The CUARRAYS_MEMORY_LIMIT environment flag is deprecated, please use JULIA_CUDA_MEMORY_LIMIT instead.", :__init_memory__)
+        ENV["JULIA_CUDA_MEMORY_LIMIT"] = ENV["CUARRAYS_MEMORY_LIMIT"]
     end

+    if haskey(ENV, "JULIA_CUDA_MEMORY_LIMIT")
+        usage_limit[] = parse(Int, ENV["JULIA_CUDA_MEMORY_LIMIT"])
+    end
+
     if haskey(ENV, "CUARRAYS_MEMORY_POOL")
+        Base.depwarn("The CUARRAYS_MEMORY_POOL environment flag is deprecated, please use JULIA_CUDA_MEMORY_POOL instead.", :__init_memory__)
+        ENV["JULIA_CUDA_MEMORY_POOL"] = ENV["CUARRAYS_MEMORY_POOL"]
+    end
+
+    if haskey(ENV, "JULIA_CUDA_MEMORY_POOL")
         pool[] =
-            if ENV["CUARRAYS_MEMORY_POOL"] == "binned"
+            if ENV["JULIA_CUDA_MEMORY_POOL"] == "binned"
                 BinnedPool
-            elseif ENV["CUARRAYS_MEMORY_POOL"] == "simple"
+            elseif ENV["JULIA_CUDA_MEMORY_POOL"] == "simple"
                 SimplePool
-            elseif ENV["CUARRAYS_MEMORY_POOL"] == "split"
+            elseif ENV["JULIA_CUDA_MEMORY_POOL"] == "split"
                 SplittingPool
-            elseif ENV["CUARRAYS_MEMORY_POOL"] == "none"
+            elseif ENV["JULIA_CUDA_MEMORY_POOL"] == "none"
                 DummyPool
             else
                 error("Invalid allocator selected")
@@ -436,7 +457,7 @@
     pool[].init()

     # if the user hand-picked an allocator, be a little verbose
-    if haskey(ENV, "CUARRAYS_MEMORY_POOL")
+    if haskey(ENV, "JULIA_CUDA_MEMORY_POOL")
         atexit(()->begin
             Core.println("""
                 CuArrays.jl $(nameof(pool[])) statistics:
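The environment-variable renames in `__init_memory__` follow a uniform deprecate-and-forward pattern: warn, copy the old value to the new name, then read only the new name. The same pattern in isolation, with hypothetical flag names:

# Deprecate-and-forward for renamed environment flags
# (OLD_FLAG and NEW_FLAG are hypothetical names).
function memory_limit()
    if haskey(ENV, "OLD_FLAG")
        Base.depwarn("The OLD_FLAG environment flag is deprecated, please use NEW_FLAG instead.",
                     :memory_limit)
        ENV["NEW_FLAG"] = ENV["OLD_FLAG"]
    end
    haskey(ENV, "NEW_FLAG") ? parse(Int, ENV["NEW_FLAG"]) : nothing
end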
