Skip to content
This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Commit

Permalink
Use a return value-based retry scheme for all APIs failing to allocate.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Mar 15, 2020
1 parent e18e339 commit ddfd0ae
Show file tree
Hide file tree
Showing 18 changed files with 42 additions and 66 deletions.
2 changes: 1 addition & 1 deletion src/blas/CUBLAS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using CUDAdrv: CUstream
using CUDAnative

using ..CuArrays
using ..CuArrays: libcublas, unsafe_free!
using ..CuArrays: libcublas, unsafe_free!, @retry_reclaim
using LinearAlgebra

using CEnum
Expand Down
2 changes: 1 addition & 1 deletion src/blas/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUBLAS_STATUS_ALLOC_FAILED $(esc(ex))
if res != CUBLAS_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
2 changes: 1 addition & 1 deletion src/dnn/CUDNN.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ using CUDAnative
using CEnum

using ..CuArrays
using ..CuArrays: libcudnn, @argout, @workspace
using ..CuArrays: libcudnn, @argout, @workspace, @retry_reclaim
import ..CuArrays.unsafe_free!

import NNlib
Expand Down
2 changes: 1 addition & 1 deletion src/dnn/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUDNN_STATUS_ALLOC_FAILED $(esc(ex))
if res != CUDNN_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
2 changes: 1 addition & 1 deletion src/fft/CUFFT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module CUFFT
using CUDAapi

using ..CuArrays
import ..CuArrays: libcufft, unsafe_free!
import ..CuArrays: libcufft, unsafe_free!, @retry_reclaim

using CUDAdrv
using CUDAdrv: CUstream
Expand Down
2 changes: 1 addition & 1 deletion src/fft/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUFFT_ALLOC_FAILED $(esc(ex))
if res != CUFFT_SUCCESS
throw_api_error(res)
end
Expand Down
48 changes: 17 additions & 31 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -232,48 +232,34 @@ actually reclaimed.
reclaim(sz::Int=typemax(Int)) = pool[].reclaim(sz)

"""
extalloc(f::Function; check::Function=isa(OutOfGPUMemoryError), nb::Integer=typemax(Int))
@retry_reclaim fail ex
Run a function `f` repeatedly until it successfully allocates the memory it needs. Only
out-of-memory exceptions that pass `check` are considered for retry; this defaults to
checking for the CuArrays out-of-memory exception but should be customized to detect how
an out-of-memory situation is reported by the function `f`. The argument `nb` indicates how
many bytes of memory `f` requires, and serves as a hint for how much memory to reclaim
before trying `f` again.
Run a block of code `ex` repeatedly until it successfully allocates the memory it needs.
Failure to do so is indicated by `ex` returning `fail`. At each try, more and more memory is
freed from the CuArrays memory pool. When that is not possible anymore, `fail` will be returned.
This function is intended to be used with external functionality that allocates but does not
use the CuArrays memory pool, thus conflicting with its caching behavior.
This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the
CuArrays memory pool) and return a specific error code when failing to.
"""
function extalloc(f::Function; check::Function=ex->isa(ex,OutOfGPUMemoryError), nb::Integer=typemax(Int))
phase = 0
while true
phase += 1
return try
f()
catch ex
check(ex) || rethrow()

# incrementally costly reclaim of more and more memory
macro retry_reclaim(fail, ex)
quote
ret = nothing
for phase in 1:3
ret = $(esc(ex))
ret == $(esc(fail)) || break

# incrementally more costly reclaim of cached memory
if phase == 1
reclaim(nb)
reclaim()
elseif phase == 2
GC.gc(false)
reclaim(nb)
reclaim()
elseif phase == 3
GC.gc(true)
reclaim(nb)
elseif phase == 4
# maybe the user lied, so try reclaiming all memory
GC.gc(true)
reclaim()
else
# give up
rethrow()
end

# try again
continue
end
ret
end
end

Expand Down
2 changes: 1 addition & 1 deletion src/rand/CURAND.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module CURAND

using ..CuArrays
using ..CuArrays: libcurand
using ..CuArrays: libcurand, @retry_reclaim

using CUDAapi

Expand Down
2 changes: 1 addition & 1 deletion src/rand/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CURAND_STATUS_ALLOCATION_FAILED $(esc(ex))
if res != CURAND_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
18 changes: 9 additions & 9 deletions src/rand/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ mutable struct RNG <: Random.AbstractRNG

function RNG(typ=CURAND_RNG_PSEUDO_DEFAULT)
handle_ref = Ref{curandGenerator_t}()
@allocates curandCreateGenerator(handle_ref, typ)
curandCreateGenerator(handle_ref, typ)

obj = new(handle_ref[], context(), typ)
finalizer(unsafe_destroy!, obj)
Expand All @@ -42,7 +42,7 @@ Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.handle
function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
curandSetPseudoRandomGeneratorSeed(rng, seed)
curandSetGeneratorOffset(rng, offset)
@allocates curandGenerateSeeds(rng)
curandGenerateSeeds(rng)
return
end

Expand All @@ -59,11 +59,11 @@ end
const UniformType = Union{Type{Float32},Type{Float64}}
const UniformArray = CuArray{<:Union{Float32,Float64}}
function Random.rand!(rng::RNG, A::CuArray{Float32})
@allocates curandGenerateUniform(rng, A, length(A))
curandGenerateUniform(rng, A, length(A))
return A
end
function Random.rand!(rng::RNG, A::CuArray{Float64})
@allocates curandGenerateUniformDouble(rng, A, length(A))
curandGenerateUniformDouble(rng, A, length(A))
return A
end

Expand All @@ -86,31 +86,31 @@ end
const NormalType = Union{Type{Float32},Type{Float64}}
const NormalArray = CuArray{<:Union{Float32,Float64}}
function Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1)
inplace_pow2(A, B->@allocates(curandGenerateNormal(rng, B, length(B), mean, stddev)))
inplace_pow2(A, B->curandGenerateNormal(rng, B, length(B), mean, stddev))
return A
end
function Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1)
inplace_pow2(A, B->@allocates(curandGenerateNormalDouble(rng, B, length(B), mean, stddev)))
inplace_pow2(A, B->curandGenerateNormalDouble(rng, B, length(B), mean, stddev))
return A
end

# log-normal
const LognormalType = Union{Type{Float32},Type{Float64}}
const LognormalArray = CuArray{<:Union{Float32,Float64}}
function rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1)
inplace_pow2(A, B->@allocates(curandGenerateLogNormal(rng, B, length(B), mean, stddev)))
inplace_pow2(A, B->curandGenerateLogNormal(rng, B, length(B), mean, stddev))
return A
end
function rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1)
inplace_pow2(A, B->@allocates(curandGenerateLogNormalDouble(rng, B, length(B), mean, stddev)))
inplace_pow2(A, B->curandGenerateLogNormalDouble(rng, B, length(B), mean, stddev))
return A
end

# poisson
const PoissonType = Union{Type{Cuint}}
const PoissonArray = CuArray{Cuint}
function rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1)
@allocates curandGeneratePoisson(rng, A, length(A), lambda)
curandGeneratePoisson(rng, A, length(A), lambda)
return A
end

Expand Down
9 changes: 0 additions & 9 deletions src/rand/wrappers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,3 @@ end
# Build a `VersionNumber` from the three properties queried through `curandGetProperty`:
# major, minor and patch level, in that order.
function version()
    major = curandGetProperty(CUDAapi.MAJOR_VERSION)
    minor = curandGetProperty(CUDAapi.MINOR_VERSION)
    patch = curandGetProperty(CUDAapi.PATCH_LEVEL)
    return VersionNumber(major, minor, patch)
end

"""
    @allocates ex

Wrap the expression `ex` in a call to `CuArrays.extalloc`, so that it is evaluated inside
the zero-argument function handed to `extalloc`.

The `check` predicate passed along only accepts exceptions that are a `CURANDError` whose
`code` equals `CURAND_STATUS_ALLOCATION_FAILED`; any other exception is not treated as an
allocation failure by `extalloc`.
"""
macro allocates(ex)
    return quote
        CuArrays.extalloc(;
            check=err -> (err isa CURANDError &&
                          err.code == CURAND_STATUS_ALLOCATION_FAILED)
        ) do
            # the user expression is escaped so it runs in the caller's scope
            $(esc(ex))
        end
    end
end
2 changes: 1 addition & 1 deletion src/solver/CUSOLVER.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module CUSOLVER

using ..CuArrays
using ..CuArrays: libcusolver, @allowscalar, unsafe_free!, @argout, @workspace
using ..CuArrays: libcusolver, @allowscalar, unsafe_free!, @argout, @workspace, @retry_reclaim

using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t
using ..CUSPARSE: cusparseMatDescr_t
Expand Down
2 changes: 1 addition & 1 deletion src/solver/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUSOLVER_STATUS_ALLOC_FAILED $(esc(ex))
if res != CUSOLVER_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
2 changes: 1 addition & 1 deletion src/sparse/CUSPARSE.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module CUSPARSE

using ..CuArrays
using ..CuArrays: libcusparse, unsafe_free!, @argout, @workspace
using ..CuArrays: libcusparse, unsafe_free!, @argout, @workspace, @retry_reclaim

using CUDAapi

Expand Down
2 changes: 1 addition & 1 deletion src/sparse/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUSPARSE_STATUS_ALLOC_FAILED $(esc(ex))
if res != CUSPARSE_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
2 changes: 1 addition & 1 deletion src/tensor/CUTENSOR.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module CUTENSOR

using ..CuArrays
using ..CuArrays: libcutensor, @argout, @workspace
using ..CuArrays: libcutensor, @argout, @workspace, @retry_reclaim

using CUDAapi

Expand Down
2 changes: 1 addition & 1 deletion src/tensor/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ end

macro check(ex)
quote
res = $(esc(ex))
res = @retry_reclaim CUTENSOR_STATUS_ALLOC_FAILED $(esc(ex))
if res != CUTENSOR_STATUS_SUCCESS
throw_api_error(res)
end
Expand Down
5 changes: 2 additions & 3 deletions test/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
CuArrays.reclaim(1024)
CuArrays.reclaim()

CuArrays.extalloc(()->())
CuArrays.extalloc(()->(); check=ex->true)
CuArrays.extalloc(()->(); nb=1)
@test CuArrays.@retry_reclaim(42, return 42) == 42
@test CuArrays.@retry_reclaim(42, return 41) == 41

@test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20)
@test_throws OutOfGPUMemoryError CuArrays.extalloc() do
Expand Down

0 comments on commit ddfd0ae

Please sign in to comment.