diff --git a/.gitignore b/.gitignore
index 4bf0fa095..c86edad24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 build/
+release/
+debug/
 build-*/
 out/
 tmp/
diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h
index 23600eea9..700e7a529 100644
--- a/include/ggml-alloc.h
+++ b/include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
+GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API enum ggml_status ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
     const int * node_buffer_ids,
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
index 7244a9cbb..701bc30ae 100644
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -378,6 +378,7 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    //GGML_LOG_DEBUG("%s: \n", __func__);
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+                return GGML_STATUS_ALLOC_FAILED;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
     return false;
 }
 
+// Check with reviewers: are there any cons to returning a ggml_status here?
 bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
-            if (!ggml_gallocr_reserve(galloc, graph)) {
+            enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
+            if (s != GGML_STATUS_SUCCESS) {
+                GGML_LOG_INFO("%s: ggml_gallocr_reserve failed (status=%d)\n", __func__, s);
                 return false;
             }
         } else {
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
index dba7be33b..7ba351720 100644
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
-
-    return buft->iface.alloc_buffer(buft, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+        b = buft->iface.alloc_buffer(buft, size);
+    } catch (const std::exception & e) {
+        GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s\n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
+    assert(buffer);
     return buffer->buft;
 }
 
@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
 }
 
 enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+    ggml_status s;
+    try {
+        s = backend->iface.graph_compute(backend, cgraph);
+    } catch (const std::bad_alloc & e) {
+        return GGML_STATUS_ALLOC_FAILED;
+    } catch (const std::exception & e) {
+        GGML_LOG_INFO("%s: graph_compute threw: %s\n", __func__, e.what());
+        return GGML_STATUS_FAILED;
+    }
+    return s;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
diff --git a/src/ggml-cpu/amx/amx.cpp b/src/ggml-cpu/amx/amx.cpp
index 5ec5263ce..a598333ac 100644
--- a/src/ggml-cpu/amx/amx.cpp
+++ b/src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
     tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
 
     GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh
index fd4dcfa94..9084198d7 100644
--- a/src/ggml-cuda/common.cuh
+++ b/src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
 
 #define GGML_CUDA_MAX_STREAMS 8
 
-[[noreturn]]
-void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+// Prints the error, then either aborts or throws an exception.
+[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
 
 #define CUDA_CHECK_GEN(err, success, error_fn) \
     do { \
@@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
     cuGetErrorString(err, &err_str);
     return err_str;
 }
+// Prints the error, then aborts or throws
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index 6ea415777..ac858b32a 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ABORT to get a stack trace
-    GGML_ABORT(GGML_CUDA_NAME " error");
+    static const char * GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
+    if (!GGML_CUDA_NO_ABORT) {
+        GGML_ABORT(GGML_CUDA_NAME " error");
+    }
+#ifndef __CUDA_ARCH__
+    throw std::runtime_error(msg);
+#endif
 }
 
 // this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+// Note: does not abort or throw because it does not use CUDA_CHECK
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
 #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
+    GGML_ASSERT(tensor);
+    GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, (void *) tensor, tensor->name);
     if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
     // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
     ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+        b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    } catch (const std::exception & e) {
+        GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s\n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
diff --git a/src/ggml.c b/src/ggml.c
index e9f3420c2..65895918e 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
+    assert(src);
     return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    assert(a);
+    assert(b);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
+    assert(a);
+    assert(b);
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5db778cd8..3383a95f4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -412,3 +412,9 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+
+set(TEST_TARGET test-oom)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
diff --git a/tests/test-arange.cpp b/tests/test-arange.cpp
index 4b7a98584..047ba4887 100644
--- a/tests/test-arange.cpp
+++ b/tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-    ggml_backend_graph_compute(backend, graph);
+    GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);
 
     float * output = new float[ggml_nelements(t)];
     ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1bfd41254..ceea52d24 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
         ggml_build_forward_expand(gf, out);
 
         // warmup run
-        ggml_backend_graph_compute(backend, gf);
+        ggml_status status = ggml_backend_graph_compute(backend, gf);
+        if (status != GGML_STATUS_SUCCESS)
+            printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d\n", status);
 
         // determine number of runs
         int n_runs;
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
index 578d3e786..59096a78c 100644
--- a/tests/test-mul-mat.cpp
+++ b/tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-
-    ggml_backend_graph_compute(model.backend, gf);
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS)
+        return nullptr;
 
     //ggml_graph_print(gf);
 
@@ -313,6 +314,10 @@ int main(void)
     }
 
     struct ggml_tensor * result = compute(model, allocr);
+    if (!result) {
+        printf("ggml_mul_mat: failed to compute graph\n");
+        return EXIT_FAILURE;
+    }
 
     std::vector<float> out_data(ggml_nelements(result));
 
diff --git a/tests/test-oom.cpp b/tests/test-oom.cpp
new file mode 100644
index 000000000..14aaf9e88
--- /dev/null
+++ b/tests/test-oom.cpp
@@ -0,0 +1,235 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+struct test_model {
+    struct ggml_tensor * a = nullptr;
+    struct ggml_tensor * b = nullptr;
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer = NULL;
+    struct ggml_context * ctx = nullptr;
+    int M = 0, N = 0, K = 1;
+};
+
+// returns the free CUDA device memory, in MB
+size_t getCudaFreeMem() {
+    size_t cudafree = 0;
+    size_t cudatotal = 0;
+    ggml_backend_cuda_get_device_memory(0, &cudafree, &cudatotal);
+    return cudafree/1024/1024;
+}
+
+ggml_status load_model(test_model & model, unsigned S) {
+    size_t totalFreeMem = getCudaFreeMem();
+    printf("%s: cuda free: %zu MB\n", __func__, totalFreeMem);
+
+    // for a 2D matrix multiplication: K = shared dim, M = num rows of the left tensor A, N = num cols of the right tensor B
+    model.M = S;
+    model.N = S;
+    model.K = S;
+    printf("%s: M=%d N=%d K=%d\n", __func__, model.M, model.N, model.K);
+
+    size_t buffer_size = 0;
+    {
+        buffer_size += (model.M * model.K) * ggml_type_size(GGML_TYPE_F32); // tensor a
+        buffer_size += (model.K * model.N) * ggml_type_size(GGML_TYPE_F32); // tensor b
+        buffer_size += (model.M * model.N) * ggml_type_size(GGML_TYPE_F32); // output tensor
+        buffer_size += 1024; // overhead
+    }
+    printf("%s: backend buffer size = %zu KB\n", __func__, buffer_size/1024);
+
+    int num_tensors = 3;
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    // initialize the backend
+    printf("%s: using CUDA backend\n", __func__);
+    model.backend = ggml_backend_cuda_init(0);
+    if (!model.backend) {
+        printf("%s: ggml_backend_cuda_init() failed\n", __func__);
+        return GGML_STATUS_FAILED;
+    }
+
+    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
+    if (!model.buffer) {
+        printf("%s: model.buffer is null\n", __func__);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    printf("%s: buffer allocated. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create context
+    model.ctx = ggml_init(params);
+    printf("%s: ctx created. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create tensors
+    printf("%s: creating input tensors...\n", __func__);
+    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.M);
+    if (!model.a) {
+        printf("%s: cannot create tensor a\n", __func__);
+        abort();
+    }
+    model.a->name[0] = 'A';
+    //printf("Matrix A: [%i, %i]\n", K, M);
+    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.N);
+    if (!model.b)
+        abort();
+    model.b->name[0] = 'B';
+    //printf("Matrix B: [%i, %i]\n", K, N);
+    printf("%s: tensors (a&b) created. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create an allocator
+    struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
+
+    // alloc memory for a
+    ggml_tallocr_alloc(&alloc, model.a);
+
+    // alloc memory for b
+    ggml_tallocr_alloc(&alloc, model.b);
+    return GGML_STATUS_SUCCESS;
+}
+
+
+struct ggml_cgraph * build_graph(const test_model& model, ggml_tensor* a, ggml_tensor *b, unsigned repeat) {
+    printf("build_graph: repeat=%d ...\n", repeat);
+    static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    printf("%s: graph buf size: %zu KB\n", __func__, buf_size/1024);
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    if (!ctx0) {
+        printf("error: ggml_init returned null\n");
+        return nullptr;
+    }
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    if (!gf)
+        return nullptr;
+
+    // zT = x @ yT
+    struct ggml_tensor * result = ggml_mul_mat(ctx0, a, ggml_cont(ctx0, b));
+    if (!result) {
+        printf("error: ggml_mul_mat returned null\n");
+        return nullptr;
+    }
+
+    // z = (zT)T
+    struct ggml_tensor* T = ggml_transpose(ctx0, result);
+    if (!T) {
+        fprintf(stderr, "error: ggml_transpose returned null\n");
+        return nullptr;
+    }
+
+    struct ggml_tensor* c = ggml_cont(ctx0, T);
+    if (!c) {
+        fprintf(stderr, "error: ggml_cont returned null\n");
+        return nullptr;
+    }
+
+    std::vector<ggml_tensor *> outTensors;
+    outTensors.push_back(c);
+    for (unsigned i = 0; i < repeat; i++) {
+        struct ggml_tensor * d = ggml_mul_mat(ctx0, outTensors.back(), ggml_cont(ctx0, outTensors.back()));
+        if (!d) {
+            printf("error: ggml_mul_mat returned null\n");
+            return nullptr;
+        }
+        //printf("%s: matmul out: %s %ld %ld \n", __func__, d->name, d->ne[0], d->ne[1]);
+        outTensors.push_back(d);
+        c = ggml_concat(ctx0, c, d, 0);
+    }
+
+    ggml_build_forward_expand(gf, c);
+
+    // delete the temporary context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+ggml_status compute(const test_model & model, ggml_gallocr_t allocr, unsigned repeat) {
+    printf("compute ...\n");
+    printf("compute: free device mem: %zu MB\n", getCudaFreeMem());
+
+    ggml_tensor* ot = NULL;
+    ggml_tensor* left = model.a;
+    ggml_tensor* right = model.b;
+
+    struct ggml_cgraph * gf = build_graph(model, left, right, repeat);
+    printf("compute: graph built. free cuda mem: %zu MB\n", getCudaFreeMem());
+
+    // allocate tensors
+    if (!ggml_gallocr_alloc_graph(allocr, gf))
+        return GGML_STATUS_ALLOC_FAILED;
+
+    printf("%s: graph buf allocated. free device mem: %zu MB\n", __func__, getCudaFreeMem());
+
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS)
+        return status;
+
+    ggml_graph_print(gf);
+    printf("compute: graph computed. free device mem: %zu MB\n", getCudaFreeMem());
+    // in this case, the output tensor is the last one in the graph
+    ot = ggml_graph_node(gf, -1);
+    if (!ot)
+        return GGML_STATUS_FAILED;
+    printf("%s: output tensor shape: %ld x %ld name: %s\n", __func__, ot->ne[0], ot->ne[1], ot->name);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+
+int main(void) {
+#ifndef GGML_USE_CUDA
+    fprintf(stderr, "note: test-oom is only implemented for the CUDA backend at the moment\n");
+    return 0;
+#endif
+
+    const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
+    if (!GGML_CUDA_NO_ABORT) {
+        fprintf(stderr, "warning: skipping: test-oom requires the GGML_CUDA_NO_ABORT envvar to be set\n");
+        return 0;
+    }
+
+    test_model model;
+
+    ggml_status status = load_model(model, 8192); // will also init the backend
+    if (status != GGML_STATUS_SUCCESS) {
+        printf("main: failed to load model\n");
+        return GGML_EXIT_ABORTED;
+    }
+
+    ggml_gallocr_t allocr = NULL;
+    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    if (!allocr) {
+        printf("main: ggml_gallocr_new failed\n");
+        return GGML_EXIT_ABORTED;
+    }
+
+    // run multiple matmuls in a loop, accumulating large output tensors; this should OOM
+    status = compute(model, allocr, 160);
+    if (status == GGML_STATUS_SUCCESS) {
+        printf("main: compute failed to OOM (matmuls too small to exhaust GPU memory? loop count too small?)\n");
+        return GGML_EXIT_ABORTED;
+    }
+    printf("main: compute OOMed as expected: ggml status=%d (expected %d)\n", status, GGML_STATUS_ALLOC_FAILED);
+    return GGML_EXIT_SUCCESS;
+}
diff --git a/tests/test-timestep_embedding.cpp b/tests/test-timestep_embedding.cpp
index a55865973..e41942633 100644
--- a/tests/test-timestep_embedding.cpp
+++ b/tests/test-timestep_embedding.cpp
@@ -159,7 +159,7 @@ int main(int argc, const char** argv) {
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-    ggml_backend_graph_compute(backend, graph);
+    GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);
 
     float * output = new float[ggml_nelements(t)];
     ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));