From ddfd0aec997a2155c63004d08d54f9dafc93177b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 14 Mar 2020 17:14:44 +0100 Subject: [PATCH] Use a return value-based retry scheme for all APIs failing to allocate. --- src/blas/CUBLAS.jl | 2 +- src/blas/error.jl | 2 +- src/dnn/CUDNN.jl | 2 +- src/dnn/error.jl | 2 +- src/fft/CUFFT.jl | 2 +- src/fft/error.jl | 2 +- src/memory.jl | 48 +++++++++++++++--------------------------- src/rand/CURAND.jl | 2 +- src/rand/error.jl | 2 +- src/rand/random.jl | 18 ++++++++-------- src/rand/wrappers.jl | 9 -------- src/solver/CUSOLVER.jl | 2 +- src/solver/error.jl | 2 +- src/sparse/CUSPARSE.jl | 2 +- src/sparse/error.jl | 2 +- src/tensor/CUTENSOR.jl | 2 +- src/tensor/error.jl | 2 +- test/memory.jl | 5 ++--- 18 files changed, 42 insertions(+), 66 deletions(-) diff --git a/src/blas/CUBLAS.jl b/src/blas/CUBLAS.jl index 26fca209..9b7f74d3 100644 --- a/src/blas/CUBLAS.jl +++ b/src/blas/CUBLAS.jl @@ -8,7 +8,7 @@ using CUDAdrv: CUstream using CUDAnative using ..CuArrays -using ..CuArrays: libcublas, unsafe_free! +using ..CuArrays: libcublas, unsafe_free!, @retry_reclaim using LinearAlgebra using CEnum diff --git a/src/blas/error.jl b/src/blas/error.jl index 2f522a20..35d14d43 100644 --- a/src/blas/error.jl +++ b/src/blas/error.jl @@ -54,7 +54,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUBLAS_STATUS_ALLOC_FAILED $(esc(ex)) if res != CUBLAS_STATUS_SUCCESS throw_api_error(res) end diff --git a/src/dnn/CUDNN.jl b/src/dnn/CUDNN.jl index 49069807..fbd60d5c 100644 --- a/src/dnn/CUDNN.jl +++ b/src/dnn/CUDNN.jl @@ -11,7 +11,7 @@ using CUDAnative using CEnum using ..CuArrays -using ..CuArrays: libcudnn, @argout, @workspace +using ..CuArrays: libcudnn, @argout, @workspace, @retry_reclaim import ..CuArrays.unsafe_free! 
import NNlib diff --git a/src/dnn/error.jl b/src/dnn/error.jl index b53a7f23..6bdf75c9 100644 --- a/src/dnn/error.jl +++ b/src/dnn/error.jl @@ -26,7 +26,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUDNN_STATUS_ALLOC_FAILED $(esc(ex)) if res != CUDNN_STATUS_SUCCESS throw_api_error(res) end diff --git a/src/fft/CUFFT.jl b/src/fft/CUFFT.jl index 86724f92..aec4447d 100644 --- a/src/fft/CUFFT.jl +++ b/src/fft/CUFFT.jl @@ -3,7 +3,7 @@ module CUFFT using CUDAapi using ..CuArrays -import ..CuArrays: libcufft, unsafe_free! +import ..CuArrays: libcufft, unsafe_free!, @retry_reclaim using CUDAdrv using CUDAdrv: CUstream diff --git a/src/fft/error.jl b/src/fft/error.jl index f9b7fad1..085d233e 100644 --- a/src/fft/error.jl +++ b/src/fft/error.jl @@ -68,7 +68,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUFFT_ALLOC_FAILED $(esc(ex)) if res != CUFFT_SUCCESS throw_api_error(res) end diff --git a/src/memory.jl b/src/memory.jl index 6238db5e..4f9127e3 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -232,48 +232,34 @@ actually reclaimed. reclaim(sz::Int=typemax(Int)) = pool[].reclaim(sz) """ - extalloc(f::Function; check::Function=isa(OutOfGPUMemoryError), nb::Integer=typemax(Int)) + @retry_reclaim fail ex -Run a function `f` repeatedly until it successfully allocates the memory it needs. Only -out-of-memory exceptions that pass `check` are considered for retry; this defaults to -checking for the CuArrays out-of-memory exception but should be customized as to detect how -an out-of-memory situation is reported by the function `f`. The argument `nb` indicates how -many bytes of memory `f` requires, and serves as a hint for how much memory to reclaim -before trying `f` again. +Run a block of code `ex` repeatedly until it successfully allocates the memory it needs; +failure to do so is indicated by returning `fail`. At each try, more and more memory is freed +from the CuArrays memory pool. 
When that is not possible anymore, `fail` will be returned. -This function is intended to be used with external functionality that allocates but does not -use the CuArrays memory pool, thus conflicting with its caching behavior. +This macro is intended for use with CUDA APIs, which sometimes allocate (outside of the +CuArrays memory pool) and return a specific error code when failing to do so. """ -function extalloc(f::Function; check::Function=ex->isa(ex,OutOfGPUMemoryError), nb::Integer=typemax(Int)) - phase = 0 - while true - phase += 1 - return try - f() - catch ex - check(ex) || rethrow() - - # incrementally costly reclaim of more and more memory +macro retry_reclaim(fail, ex) + quote + ret = nothing + for phase in 1:3 + ret = $(esc(ex)) + ret == $(esc(fail)) || break + + # incrementally more costly reclaim of cached memory if phase == 1 - reclaim(nb) + reclaim() elseif phase == 2 GC.gc(false) - reclaim(nb) + reclaim() elseif phase == 3 - GC.gc(true) - reclaim(nb) - elseif phase == 4 - # maybe the user lied, so try reclaiming all memory GC.gc(true) reclaim() - else - # give up - rethrow() end - - # try again - continue end + ret end end diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 4af5130c..76aef3d1 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -1,7 +1,7 @@ module CURAND using ..CuArrays -using ..CuArrays: libcurand +using ..CuArrays: libcurand, @retry_reclaim using CUDAapi diff --git a/src/rand/error.jl b/src/rand/error.jl index 2d1ec876..5d68aab1 100644 --- a/src/rand/error.jl +++ b/src/rand/error.jl @@ -60,7 +60,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CURAND_STATUS_ALLOCATION_FAILED $(esc(ex)) if res != CURAND_STATUS_SUCCESS throw_api_error(res) end diff --git a/src/rand/random.jl b/src/rand/random.jl index 6b947ea8..d0bb3242 100644 --- a/src/rand/random.jl +++ b/src/rand/random.jl @@ -20,7 +20,7 @@ mutable struct RNG <: Random.AbstractRNG function RNG(typ=CURAND_RNG_PSEUDO_DEFAULT) handle_ref = 
Ref{curandGenerator_t}() - @allocates curandCreateGenerator(handle_ref, typ) + curandCreateGenerator(handle_ref, typ) obj = new(handle_ref[], context(), typ) finalizer(unsafe_destroy!, obj) @@ -42,7 +42,7 @@ Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.handle function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0) curandSetPseudoRandomGeneratorSeed(rng, seed) curandSetGeneratorOffset(rng, offset) - @allocates curandGenerateSeeds(rng) + curandGenerateSeeds(rng) return end @@ -59,11 +59,11 @@ end const UniformType = Union{Type{Float32},Type{Float64}} const UniformArray = CuArray{<:Union{Float32,Float64}} function Random.rand!(rng::RNG, A::CuArray{Float32}) - @allocates curandGenerateUniform(rng, A, length(A)) + curandGenerateUniform(rng, A, length(A)) return A end function Random.rand!(rng::RNG, A::CuArray{Float64}) - @allocates curandGenerateUniformDouble(rng, A, length(A)) + curandGenerateUniformDouble(rng, A, length(A)) return A end @@ -86,11 +86,11 @@ end const NormalType = Union{Type{Float32},Type{Float64}} const NormalArray = CuArray{<:Union{Float32,Float64}} function Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) - inplace_pow2(A, B->@allocates(curandGenerateNormal(rng, B, length(B), mean, stddev))) + inplace_pow2(A, B->curandGenerateNormal(rng, B, length(B), mean, stddev)) return A end function Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) - inplace_pow2(A, B->@allocates(curandGenerateNormalDouble(rng, B, length(B), mean, stddev))) + inplace_pow2(A, B->curandGenerateNormalDouble(rng, B, length(B), mean, stddev)) return A end @@ -98,11 +98,11 @@ end const LognormalType = Union{Type{Float32},Type{Float64}} const LognormalArray = CuArray{<:Union{Float32,Float64}} function rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) - inplace_pow2(A, B->@allocates(curandGenerateLogNormal(rng, B, length(B), mean, stddev))) + inplace_pow2(A, B->curandGenerateLogNormal(rng, B, length(B), mean, stddev)) 
return A end function rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) - inplace_pow2(A, B->@allocates(curandGenerateLogNormalDouble(rng, B, length(B), mean, stddev))) + inplace_pow2(A, B->curandGenerateLogNormalDouble(rng, B, length(B), mean, stddev)) return A end @@ -110,7 +110,7 @@ end const PoissonType = Union{Type{Cuint}} const PoissonArray = CuArray{Cuint} function rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1) - @allocates curandGeneratePoisson(rng, A, length(A), lambda) + curandGeneratePoisson(rng, A, length(A), lambda) return A end diff --git a/src/rand/wrappers.jl b/src/rand/wrappers.jl index 273be3bc..312c949d 100644 --- a/src/rand/wrappers.jl +++ b/src/rand/wrappers.jl @@ -9,12 +9,3 @@ end version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION), curandGetProperty(CUDAapi.MINOR_VERSION), curandGetProperty(CUDAapi.PATCH_LEVEL)) - -macro allocates(ex) - quote - CuArrays.extalloc(check=err->isa(err, CURANDError) && - err.code == CURAND_STATUS_ALLOCATION_FAILED) do - $(esc(ex)) - end - end -end diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index 34c2d7c7..9f13658e 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -1,7 +1,7 @@ module CUSOLVER using ..CuArrays -using ..CuArrays: libcusolver, @allowscalar, unsafe_free!, @argout, @workspace +using ..CuArrays: libcusolver, @allowscalar, unsafe_free!, @argout, @workspace, @retry_reclaim using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t using ..CUSPARSE: cusparseMatDescr_t diff --git a/src/solver/error.jl b/src/solver/error.jl index b5b0b584..7cd3e406 100644 --- a/src/solver/error.jl +++ b/src/solver/error.jl @@ -50,7 +50,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUSOLVER_STATUS_ALLOC_FAILED $(esc(ex)) if res != CUSOLVER_STATUS_SUCCESS throw_api_error(res) end diff --git a/src/sparse/CUSPARSE.jl b/src/sparse/CUSPARSE.jl index 4bf4139a..8f24bd61 100644 --- a/src/sparse/CUSPARSE.jl +++ 
b/src/sparse/CUSPARSE.jl @@ -1,7 +1,7 @@ module CUSPARSE using ..CuArrays -using ..CuArrays: libcusparse, unsafe_free!, @argout, @workspace +using ..CuArrays: libcusparse, unsafe_free!, @argout, @workspace, @retry_reclaim using CUDAapi diff --git a/src/sparse/error.jl b/src/sparse/error.jl index 59fb072a..1e65ac1e 100644 --- a/src/sparse/error.jl +++ b/src/sparse/error.jl @@ -28,7 +28,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUSPARSE_STATUS_ALLOC_FAILED $(esc(ex)) if res != CUSPARSE_STATUS_SUCCESS throw_api_error(res) end diff --git a/src/tensor/CUTENSOR.jl b/src/tensor/CUTENSOR.jl index f1f4568b..d32f0709 100644 --- a/src/tensor/CUTENSOR.jl +++ b/src/tensor/CUTENSOR.jl @@ -1,7 +1,7 @@ module CUTENSOR using ..CuArrays -using ..CuArrays: libcutensor, @argout, @workspace +using ..CuArrays: libcutensor, @argout, @workspace, @retry_reclaim using CUDAapi diff --git a/src/tensor/error.jl b/src/tensor/error.jl index dc975fd8..8016a380 100644 --- a/src/tensor/error.jl +++ b/src/tensor/error.jl @@ -62,7 +62,7 @@ end macro check(ex) quote - res = $(esc(ex)) + res = @retry_reclaim CUTENSOR_STATUS_ALLOC_FAILED $(esc(ex)) if res != CUTENSOR_STATUS_SUCCESS throw_api_error(res) end diff --git a/test/memory.jl b/test/memory.jl index 93663d16..00208c5c 100644 --- a/test/memory.jl +++ b/test/memory.jl @@ -3,9 +3,8 @@ CuArrays.reclaim(1024) CuArrays.reclaim() -CuArrays.extalloc(()->()) -CuArrays.extalloc(()->(); check=ex->true) -CuArrays.extalloc(()->(); nb=1) +@test CuArrays.@retry_reclaim(42, return 42) == 42 +@test CuArrays.@retry_reclaim(42, return 41) == 41 @test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20) @test_throws OutOfGPUMemoryError CuArrays.extalloc() do