Add option not to abort on cuda OOM
Warning: Not ready for merge.
Add option not to abort on cuda OOM but throw/return a ggml_status.
The goal in this ticket is NOT to be able to continue inference after an
OOM, but just to do a clean, controlled exit at a higher level.
No change to default behavior (abort).
Retouch ggml_tallocr_alloc to return a ggml_status.
Add a new unit test to check the no-abort flow (skipped if the env var
GGML_CUDA_NO_ABORT is not set).
WilliamTambellini committed Feb 13, 2025
1 parent 9a4acb3 commit 5c4d9b6
Showing 14 changed files with 314 additions and 22 deletions.
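For context, here is a minimal caller-side sketch of the intended no-abort flow. It is not part of this commit; it assumes the existing ggml-alloc/ggml-backend public API, POSIX setenv, and the GGML_CUDA_NO_ABORT environment variable introduced below.

#include <stdlib.h>
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Hypothetical helper: with GGML_CUDA_NO_ABORT set, allocation/compute failures
// surface as a ggml_status instead of aborting the process.
static enum ggml_status reserve_and_compute(ggml_gallocr_t galloc,
                                            ggml_backend_t backend,
                                            struct ggml_cgraph * graph) {
    enum ggml_status status = ggml_gallocr_reserve(galloc, graph);
    if (status != GGML_STATUS_SUCCESS) {
        return status; // e.g. GGML_STATUS_ALLOC_FAILED instead of an abort
    }
    return ggml_backend_graph_compute(backend, graph);
}

// Opt in early, before the first CUDA call can fail:
//   setenv("GGML_CUDA_NO_ABORT", "1", 1);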
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
build/
release/
debug/
build-*/
out/
tmp/
4 changes: 2 additions & 2 deletions include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
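Since ggml_gallocr_reserve and ggml_gallocr_reserve_n now return an enum ggml_status instead of a bool, existing call sites need updating; note that GGML_STATUS_SUCCESS is 0, so an old-style negated check would silently invert its meaning. A minimal migration sketch (hypothetical caller code):

// Hypothetical caller code, illustrating the bool -> ggml_status migration.
// Before: if (!ggml_gallocr_reserve(galloc, graph)) { /* failure */ }
// After (GGML_STATUS_SUCCESS is 0, so the old negation no longer means failure):
enum ggml_status status = ggml_gallocr_reserve(galloc, graph);
if (status != GGML_STATUS_SUCCESS) {
    // handle the failure, e.g. propagate GGML_STATUS_ALLOC_FAILED to the caller
}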
17 changes: 11 additions & 6 deletions src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);

if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@@ -378,6 +378,7 @@ struct ggml_gallocr {
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
//GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
GGML_ASSERT(galloc != NULL);

@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
//GGML_LOG_DEBUG("%s: \n", __func__);
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
return GGML_STATUS_ALLOC_FAILED;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}

return true;
return GGML_STATUS_SUCCESS;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
return false;
}

// Check with reviewers: any cons to returning a ggml_status here?
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
if (s != GGML_STATUS_SUCCESS) {
GGML_LOG_INFO("%s: ggml_gallocr_reserve failed to reserve. status=%d \n", __func__, s);
return false;
}
} else {
22 changes: 19 additions & 3 deletions src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0);
}

return buft->iface.alloc_buffer(buft, size);
ggml_backend_buffer_t b = NULL;
try {
b = buft->iface.alloc_buffer(buft, size);
} catch (const std::exception &e) {
GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s \n", __func__, e.what());
return NULL;
}
return b;
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
}

ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
assert(buffer);
return buffer->buft;
}

@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
}

enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
ggml_status s;
try {
s = backend->iface.graph_compute(backend, cgraph);
} catch(std::bad_alloc &e) {
return GGML_STATUS_ALLOC_FAILED;
} catch (std::exception &e) {
GGML_LOG_INFO("%s: graph_compute threw: %s", __func__, e.what());
return GGML_STATUS_FAILED;
}
return s;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
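With the try/catch added above, a backend alloc_buffer implementation that throws (for example std::bad_alloc on a CUDA OOM) now yields a NULL buffer instead of terminating the process. A small hypothetical caller sketch:

// Hypothetical caller code (assumes the existing ggml-backend API):
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
if (buf == NULL) {
    // the backend failed (or threw) while allocating; report
    // GGML_STATUS_ALLOC_FAILED upwards instead of aborting
}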
1 change: 1 addition & 0 deletions src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
5 changes: 3 additions & 2 deletions src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Print the error. Will also either abort or throw an exception.
[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
do { \
@@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
cuGetErrorString(err, &err_str);
return err_str;
}
// Will print error and abort/throw
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

22 changes: 18 additions & 4 deletions src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ABORT to get a stack trace
GGML_ABORT(GGML_CUDA_NAME " error");
static const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
if (!GGML_CUDA_NO_ABORT) {
GGML_ABORT(GGML_CUDA_NAME " error");
}
#ifndef __CUDA_ARCH__
throw std::runtime_error(msg);
#endif
}

// this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
return id;
}

// Note: Does not abort/throw because does not use CUDA_CHECK
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {

static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

GGML_ASSERT(tensor);
GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, tensor, tensor->name);
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();

return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
ggml_backend_buffer_t b = NULL;
try {
b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
} catch (std::exception &e) {
GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s \n", __func__, e.what());
return NULL;
}
return b;
}

static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
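Two notes on the ggml_cuda_error change above: GGML_CUDA_NO_ABORT is read once into a static, so it must be set before the first CUDA failure, and the throw is only compiled for host code (device code still aborts). A purely illustrative host-side sketch of what a failing CUDA check then looks like:

// Illustration only (hypothetical; not part of this commit). CUDA_CHECK calls
// ggml_cuda_error on failure: with GGML_CUDA_NO_ABORT set it now throws
// std::runtime_error, which ggml_backend_graph_compute_async maps to a ggml_status.
void * ptr = NULL;
const size_t huge = (size_t)1 << 62; // deliberately impossible allocation
try {
    CUDA_CHECK(cudaMalloc(&ptr, huge)); // may now throw instead of aborting
} catch (const std::exception & e) {
    // clean up and surface GGML_STATUS_ALLOC_FAILED / GGML_STATUS_FAILED
}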
5 changes: 5 additions & 0 deletions src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
}

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
assert(src);
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}

@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
struct ggml_tensor * b,
int dim) {
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
assert(a);
assert(b);

int64_t ne[GGML_MAX_DIMS];
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
assert(a);
assert(b);
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));

6 changes: 6 additions & 0 deletions tests/CMakeLists.txt
@@ -412,3 +412,9 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

set(TEST_TARGET test-oom)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
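The new tests/test-oom.cpp itself is not shown on this page; per the commit message it is skipped unless GGML_CUDA_NO_ABORT is set. A purely hypothetical outline of such a test:

// Hypothetical outline only; not the actual tests/test-oom.cpp from this commit.
#include <cstdio>
#include <cstdlib>

int main() {
    if (std::getenv("GGML_CUDA_NO_ABORT") == nullptr) {
        std::printf("GGML_CUDA_NO_ABORT not set, skipping\n");
        return 0; // treated as a skip/pass
    }
    // ... request an oversized CUDA buffer / graph allocation and check that a
    // ggml_status error is returned instead of the process aborting ...
    return 0;
}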
2 changes: 1 addition & 1 deletion tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}

ggml_backend_graph_compute(backend, graph);
GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);

float * output = new float[ggml_nelements(t)];
ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
4 changes: 3 additions & 1 deletion tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
ggml_build_forward_expand(gf, out);

// warmup run
ggml_backend_graph_compute(backend, gf);
ggml_status status = ggml_backend_graph_compute(backend, gf);
if (status != GGML_STATUS_SUCCESS)
printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d \n", status);

// determine number of runs
int n_runs;
9 changes: 7 additions & 2 deletions tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}


ggml_backend_graph_compute(model.backend, gf);
ggml_status status = ggml_backend_graph_compute(model.backend, gf);
if (status != GGML_STATUS_SUCCESS)
return nullptr;

//ggml_graph_print(gf);

@@ -313,6 +314,10 @@ int main(void)
}

struct ggml_tensor * result = compute(model, allocr);
if (!result) {
printf("ggml_mul_mat: failed to compute graph");
return EXIT_FAILURE;
}

std::vector<float> out_data(ggml_nelements(result));
