Add option not to abort on cuda OOM
Warning: Not ready for merge.
Add option not to abort on cuda OOM but throw/return a ggml_status.
The goal in this ticket is NOT to be able to continue inference after an
OOM, but just to do a clean, controlled exit at a higher level.
No change to default behavior (abort).
Retouch ggml_tallocr_alloc to return a ggml_status.
Add a new unit test to check the no-abort flow (skipped if the env var
GGML_CUDA_NO_ABORT is not set).
WilliamTambellini committed Feb 13, 2025
1 parent 9a4acb3 commit 5c4d9b6
Showing 14 changed files with 314 additions and 22 deletions.
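For context, here is a minimal caller-side sketch of the intended no-abort flow. It is not part of this commit; it assumes the existing ggml-alloc/ggml-backend public API, POSIX setenv, and the GGML_CUDA_NO_ABORT environment variable introduced below.

#include <stdlib.h>
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Hypothetical helper: with GGML_CUDA_NO_ABORT set, allocation/compute failures
// surface as a ggml_status instead of aborting the process.
static enum ggml_status reserve_and_compute(ggml_gallocr_t galloc,
                                            ggml_backend_t backend,
                                            struct ggml_cgraph * graph) {
    enum ggml_status status = ggml_gallocr_reserve(galloc, graph);
    if (status != GGML_STATUS_SUCCESS) {
        return status; // e.g. GGML_STATUS_ALLOC_FAILED instead of an abort
    }
    return ggml_backend_graph_compute(backend, graph);
}

// Opt in early, before the first CUDA call can fail:
//   setenv("GGML_CUDA_NO_ABORT", "1", 1);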
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
build/
release/
debug/
build-*/
out/
tmp/
4 changes: 2 additions & 2 deletions include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
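Since ggml_gallocr_reserve and ggml_gallocr_reserve_n now return an enum ggml_status instead of a bool, existing call sites need updating; note that GGML_STATUS_SUCCESS is 0, so an old-style negated check would silently invert its meaning. A minimal migration sketch (hypothetical caller code):

// Hypothetical caller code, illustrating the bool -> ggml_status migration.
// Before: if (!ggml_gallocr_reserve(galloc, graph)) { /* failure */ }
// After (GGML_STATUS_SUCCESS is 0, so the old negation no longer means failure):
enum ggml_status status = ggml_gallocr_reserve(galloc, graph);
if (status != GGML_STATUS_SUCCESS) {
    // handle the failure, e.g. propagate GGML_STATUS_ALLOC_FAILED to the caller
}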
17 changes: 11 additions & 6 deletions src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@@ -378,6 +378,7 @@ struct ggml_gallocr {
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
//GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);

@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
//GGML_LOG_DEBUG("%s: \n", __func__);
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
return GGML_STATUS_ALLOC_FAILED;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}

return true;
return GGML_STATUS_SUCCESS;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
return false;
}

// Check with reviewers: any cons to returning a ggml_status here?
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
if (s != GGML_STATUS_SUCCESS) {
GGML_LOG_INFO("%s: ggml_gallocr_reserve failed to reserve. status=%d \n", __func__, s);
return false;
}
} else {
22 changes: 19 additions & 3 deletions src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}

return buft->iface.alloc_buffer(buft, size);
ggml_backend_buffer_t b = NULL;
try {
b = buft->iface.alloc_buffer(buft, size);
} catch (const std::exception &e) {
GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s \n", __func__, e.what());
return NULL;
}
return b;
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
}

ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
assert(buffer);
return buffer->buft;
}

@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
}

enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
ggml_status s;
try {
s = backend->iface.graph_compute(backend, cgraph);
} catch(std::bad_alloc &e) {
return GGML_STATUS_ALLOC_FAILED;
} catch (std::exception &e) {
GGML_LOG_INFO("%s: graph_compute threw: %s", __func__, e.what());
return GGML_STATUS_FAILED;
}
return s;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
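With the try/catch added above, a backend alloc_buffer implementation that throws (for example std::bad_alloc on a CUDA OOM) now yields a NULL buffer instead of terminating the process. A small hypothetical caller sketch:

// Hypothetical caller code (assumes the existing ggml-backend API):
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
if (buf == NULL) {
    // the backend failed (or threw) while allocating; report
    // GGML_STATUS_ALLOC_FAILED upwards instead of aborting
}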
1 change: 1 addition & 0 deletions src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
5 changes: 3 additions & 2 deletions src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Print the error. Will also either abort or throw an exception.
[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
do { \
@@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
cuGetErrorString(err, &err_str);
return err_str;
}
// Will print error and abort/throw
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

22 changes: 18 additions & 4 deletions src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ABORT to get a stack trace
GGML_ABORT(GGML_CUDA_NAME " error");
static const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
if (!GGML_CUDA_NO_ABORT) {
GGML_ABORT(GGML_CUDA_NAME " error");
}
#ifndef __CUDA_ARCH__
throw std::runtime_error(msg);
#endif
}

// this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
return id;
}

// Note: Does not abort/throw because does not use CUDA_CHECK
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {

static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

GGML_ASSERT(tensor);
GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, tensor, tensor->name);
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();

return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
ggml_backend_buffer_t b = NULL;
try {
b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
} catch (std::exception &e) {
GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s \n", __func__, e.what());
return NULL;
}
return b;
}

static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
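Two notes on the ggml_cuda_error change above: GGML_CUDA_NO_ABORT is read once into a static, so it must be set before the first CUDA failure, and the throw is only compiled for host code (device code still aborts). A purely illustrative host-side sketch of what a failing CUDA check then looks like:

// Illustration only (hypothetical; not part of this commit). CUDA_CHECK calls
// ggml_cuda_error on failure: with GGML_CUDA_NO_ABORT set it now throws
// std::runtime_error, which ggml_backend_graph_compute_async maps to a ggml_status.
void * ptr = NULL;
const size_t huge = (size_t)1 << 62; // deliberately impossible allocation
try {
    CUDA_CHECK(cudaMalloc(&ptr, huge)); // may now throw instead of aborting
} catch (const std::exception & e) {
    // clean up and surface GGML_STATUS_ALLOC_FAILED / GGML_STATUS_FAILED
}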
5 changes: 5 additions & 0 deletions src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
}

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
assert(src);
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}

@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
struct ggml_tensor * b,
int dim) {
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
assert(a);
assert(b);

int64_t ne[GGML_MAX_DIMS];
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
assert(a);
assert(b);
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));

6 changes: 6 additions & 0 deletions tests/CMakeLists.txt
@@ -412,3 +412,9 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

set(TEST_TARGET test-oom)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
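The new tests/test-oom.cpp itself is not shown on this page; per the commit message it is skipped unless GGML_CUDA_NO_ABORT is set. A purely hypothetical outline of such a test:

// Hypothetical outline only; not the actual tests/test-oom.cpp from this commit.
#include <cstdio>
#include <cstdlib>

int main() {
    if (std::getenv("GGML_CUDA_NO_ABORT") == nullptr) {
        std::printf("GGML_CUDA_NO_ABORT not set, skipping\n");
        return 0; // treated as a skip/pass
    }
    // ... request an oversized CUDA buffer / graph allocation and check that a
    // ggml_status error is returned instead of the process aborting ...
    return 0;
}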
2 changes: 1 addition & 1 deletion tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}

ggml_backend_graph_compute(backend, graph);
GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);

float * output = new float[ggml_nelements(t)];
ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
4 changes: 3 additions & 1 deletion tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
ggml_build_forward_expand(gf, out);

// warmup run
ggml_backend_graph_compute(backend, gf);
ggml_status status = ggml_backend_graph_compute(backend, gf);
if (status != GGML_STATUS_SUCCESS)
printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d \n", status);

// determine number of runs
int n_runs;
9 changes: 7 additions & 2 deletions tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}


ggml_backend_graph_compute(model.backend, gf);
ggml_status status = ggml_backend_graph_compute(model.backend, gf);
if (status != GGML_STATUS_SUCCESS)
return nullptr;

//ggml_graph_print(gf);

@@ -313,6 +314,10 @@ int main(void)
}

struct ggml_tensor * result = compute(model, allocr);
if (!result) {
printf("ggml_mul_mat: failed to compute graph");
return EXIT_FAILURE;
}

std::vector<float> out_data(ggml_nelements(result));
