diff --git a/.gitignore b/.gitignore
index 4bf0fa095..c86edad24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 build/
+release/
+debug/
 build-*/
 out/
 tmp/
diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h
index 23600eea9..700e7a529 100644
--- a/include/ggml-alloc.h
+++ b/include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
+GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API enum ggml_status ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
     const int * node_buffer_ids,
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
index 7244a9cbb..701bc30ae 100644
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -378,6 +378,7 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    //GGML_LOG_DEBUG("%s: \n", __func__);
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+                return GGML_STATUS_ALLOC_FAILED;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
     return false;
 }
 
+// Check with reviewers: are there any cons to returning a ggml_status here?
 bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
-            if (!ggml_gallocr_reserve(galloc, graph)) {
+            enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
+            if (s != GGML_STATUS_SUCCESS) {
+                GGML_LOG_INFO("%s: ggml_gallocr_reserve failed (status=%d)\n", __func__, s);
                 return false;
             }
         } else {
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
index dba7be33b..7ba351720 100644
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
-
-    return buft->iface.alloc_buffer(buft, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+        b = buft->iface.alloc_buffer(buft, size);
+    } catch (const std::exception & e) {
+        GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s\n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
+    assert(buffer);
     return buffer->buft;
 }
 
@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
 }
 
 enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+    ggml_status s;
+    try {
+        s = backend->iface.graph_compute(backend, cgraph);
+    } catch (const std::bad_alloc & e) {
+        return GGML_STATUS_ALLOC_FAILED;
+    } catch (const std::exception & e) {
+        GGML_LOG_INFO("%s: graph_compute threw: %s\n", __func__, e.what());
+        return GGML_STATUS_FAILED;
+    }
+    return s;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
diff --git a/src/ggml-cpu/amx/amx.cpp b/src/ggml-cpu/amx/amx.cpp
index 5ec5263ce..a598333ac 100644
--- a/src/ggml-cpu/amx/amx.cpp
+++ b/src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
     tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
 
     GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh
index fd4dcfa94..9084198d7 100644
--- a/src/ggml-cuda/common.cuh
+++ b/src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
 
 #define GGML_CUDA_MAX_STREAMS 8
 
-[[noreturn]]
-void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+// Prints the error, then either aborts or throws an exception.
+[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
 
 #define CUDA_CHECK_GEN(err, success, error_fn) \
     do { \
@@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
     cuGetErrorString(err, &err_str);
     return err_str;
 }
+// Prints the error, then aborts or throws
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
index 6ea415777..ac858b32a 100644
--- a/src/ggml-cuda/ggml-cuda.cu
+++ b/src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ABORT to get a stack trace
-    GGML_ABORT(GGML_CUDA_NAME " error");
+    static const char * GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
+    if (!GGML_CUDA_NO_ABORT) {
+        GGML_ABORT(GGML_CUDA_NAME " error");
+    }
+#ifndef __CUDA_ARCH__
+    throw std::runtime_error(msg);
+#endif
 }
 
 // this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+// Note: does not abort or throw because it does not use CUDA_CHECK
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
 #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
+    GGML_ASSERT(tensor);
+    GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, (void *) tensor, tensor->name);
     if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
     // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
     ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+        b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    } catch (const std::exception & e) {
+        GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s\n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
diff --git a/src/ggml.c b/src/ggml.c
index e9f3420c2..65895918e 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
+    assert(src);
     return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    assert(a);
+    assert(b);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
+    assert(a);
+    assert(b);
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5db778cd8..3383a95f4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -412,3 +412,9 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+
+set(TEST_TARGET test-oom)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
diff --git a/tests/test-arange.cpp b/tests/test-arange.cpp
index 4b7a98584..047ba4887 100644
--- a/tests/test-arange.cpp
+++ b/tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-    ggml_backend_graph_compute(backend, graph);
+    GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);
 
     float * output = new float[ggml_nelements(t)];
     ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1bfd41254..ceea52d24 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
         ggml_build_forward_expand(gf, out);
 
         // warmup run
-        ggml_backend_graph_compute(backend, gf);
+        ggml_status status = ggml_backend_graph_compute(backend, gf);
+        if (status != GGML_STATUS_SUCCESS)
+            printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d\n", status);
 
         // determine number of runs
         int n_runs;
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
index 578d3e786..59096a78c 100644
--- a/tests/test-mul-mat.cpp
+++ b/tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-
-    ggml_backend_graph_compute(model.backend, gf);
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS)
+        return nullptr;
 
     //ggml_graph_print(gf);
 
@@ -313,6 +314,10 @@ int main(void)
     }
 
     struct ggml_tensor * result = compute(model, allocr);
+    if (!result) {
+        printf("ggml_mul_mat: failed to compute graph\n");
+        return EXIT_FAILURE;
+    }
 
     std::vector<float> out_data(ggml_nelements(result));
 
diff --git a/tests/test-oom.cpp b/tests/test-oom.cpp
new file mode 100644
index 000000000..14aaf9e88
--- /dev/null
+++ b/tests/test-oom.cpp
@@ -0,0 +1,235 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+struct test_model {
+    struct ggml_tensor * a = nullptr;
+    struct ggml_tensor * b = nullptr;
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer = NULL;
+    struct ggml_context * ctx = nullptr;
+    int M = 0, N = 0, K = 1;
+};
+
+// returns the free CUDA device memory, in MB
+size_t getCudaFreeMem() {
+    size_t cudafree = 0;
+    size_t cudatotal = 0;
+    ggml_backend_cuda_get_device_memory(0, &cudafree, &cudatotal);
+    return cudafree/1024/1024;
+}
+
+ggml_status load_model(test_model & model, unsigned S) {
+    size_t totalFreeMem = getCudaFreeMem();
+    printf("%s: cuda free: %zu MB\n", __func__, totalFreeMem);
+
+    // for a 2D matrix multiplication: K = shared dim, M = num rows of the left tensor A, N = num cols of the right tensor B
+    model.M = S;
+    model.N = S;
+    model.K = S;
+    printf("%s: M=%d N=%d K=%d\n", __func__, model.M, model.N, model.K);
+
+    size_t buffer_size = 0;
+    {
+        buffer_size += (model.M * model.K) * ggml_type_size(GGML_TYPE_F32); // tensor a
+        buffer_size += (model.K * model.N) * ggml_type_size(GGML_TYPE_F32); // tensor b
+        buffer_size += (model.M * model.N) * ggml_type_size(GGML_TYPE_F32); // output tensor
+        buffer_size += 1024; // overhead
+    }
+    printf("%s: backend buffer size = %zu KB\n", __func__, buffer_size/1024);
+
+    int num_tensors = 3;
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+
+    // initialize the backend
+    printf("%s: using CUDA backend\n", __func__);
+    model.backend = ggml_backend_cuda_init(0);
+    if (!model.backend) {
+        printf("%s: ggml_backend_cuda_init() failed\n", __func__);
+        return GGML_STATUS_FAILED;
+    }
+
+    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
+    if (!model.buffer) {
+        printf("%s: model.buffer is null\n", __func__);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    printf("%s: buffer allocated. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create context
+    model.ctx = ggml_init(params);
+    printf("%s: ctx created. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create tensors
+    printf("%s: creating input tensors...\n", __func__);
+    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.M);
+    if (!model.a) {
+        printf("%s: cannot create tensor a\n", __func__);
+        abort();
+    }
+    model.a->name[0] = 'A';
+    //printf("Matrix A: [%i, %i]\n", K, M);
+    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, model.K, model.N);
+    if (!model.b)
+        abort();
+    model.b->name[0] = 'B';
+    //printf("Matrix B: [%i, %i]\n", K, N);
+    printf("%s: tensors (a&b) created. cuda free: %zu MB\n", __func__, getCudaFreeMem());
+
+    // create an allocator
+    struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
+
+    // alloc memory for a
+    ggml_tallocr_alloc(&alloc, model.a);
+
+    // alloc memory for b
+    ggml_tallocr_alloc(&alloc, model.b);
+    return GGML_STATUS_SUCCESS;
+}
+
+
+struct ggml_cgraph * build_graph(const test_model& model, ggml_tensor* a, ggml_tensor *b, unsigned repeat) {
+    printf("build_graph: repeat=%d ...\n", repeat);
+    static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    printf("%s: graph buf size: %zu KB\n", __func__, buf_size/1024);
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    if (!ctx0) {
+        printf("error: ggml_init returned null\n");
+        return nullptr;
+    }
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    if (!gf)
+        return nullptr;
+
+    // zT = x @ yT
+    struct ggml_tensor * result = ggml_mul_mat(ctx0, a, ggml_cont(ctx0, b));
+    if (!result) {
+        printf("error: ggml_mul_mat returned null\n");
+        return nullptr;
+    }
+
+    // z = (zT)T
+    struct ggml_tensor* T = ggml_transpose(ctx0, result);
+    if (!T) {
+        fprintf(stderr, "error: ggml_transpose returned null\n");
+        return nullptr;
+    }
+
+    struct ggml_tensor* c = ggml_cont(ctx0, T);
+    if (!c) {
+        fprintf(stderr, "error: ggml_cont returned null\n");
+        return nullptr;
+    }
+
+    std::vector<ggml_tensor *> outTensors;
+    outTensors.push_back(c);
+    for (unsigned i = 0; i < repeat; i++) {
+        struct ggml_tensor * d = ggml_mul_mat(ctx0, outTensors.back(), ggml_cont(ctx0, outTensors.back()));
+        if (!d) {
+            printf("error: ggml_mul_mat returned null\n");
+            return nullptr;
+        }
+        //printf("%s: matmul out: %s %ld %ld \n", __func__, d->name, d->ne[0], d->ne[1]);
+        outTensors.push_back(d);
+        c = ggml_concat(ctx0, c, d, 0);
+    }
+
+    ggml_build_forward_expand(gf, c);
+
+    // delete the temporary context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+ggml_status compute(const test_model & model, ggml_gallocr_t allocr, unsigned repeat) {
+    printf("compute ...\n");
+    printf("compute: free device mem: %zu MB\n", getCudaFreeMem());
+
+    ggml_tensor* ot = NULL;
+    ggml_tensor* left = model.a;
+    ggml_tensor* right = model.b;
+
+    struct ggml_cgraph * gf = build_graph(model, left, right, repeat);
+    printf("compute: graph built. free cuda mem: %zu MB\n", getCudaFreeMem());
+
+    // allocate tensors
+    if (!ggml_gallocr_alloc_graph(allocr, gf))
+        return GGML_STATUS_ALLOC_FAILED;
+
+    printf("%s: graph buf allocated. free device mem: %zu MB\n", __func__, getCudaFreeMem());
+
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS)
+        return status;
+
+    ggml_graph_print(gf);
+    printf("compute: graph computed. free device mem: %zu MB\n", getCudaFreeMem());
+    // in this case, the output tensor is the last one in the graph
+    ot = ggml_graph_node(gf, -1);
+    if (!ot)
+        return GGML_STATUS_FAILED;
+    printf("%s: output tensor shape: %ld x %ld name: %s\n", __func__, ot->ne[0], ot->ne[1], ot->name);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+
+int main(void) {
+#ifndef GGML_USE_CUDA
+    fprintf(stderr, "note: test-oom is only implemented for the CUDA backend at the moment\n");
+    return 0;
+#endif
+
+    const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
+    if (!GGML_CUDA_NO_ABORT) {
+        fprintf(stderr, "warning: skipping: test-oom requires the GGML_CUDA_NO_ABORT envvar to be set\n");
+        return 0;
+    }
+
+    test_model model;
+
+    ggml_status status = load_model(model, 8192); // will also init the backend
+    if (status != GGML_STATUS_SUCCESS) {
+        printf("main: failed to load model\n");
+        return GGML_EXIT_ABORTED;
+    }
+
+    ggml_gallocr_t allocr = NULL;
+    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    if (!allocr) {
+        printf("main: ggml_gallocr_new failed\n");
+        return GGML_EXIT_ABORTED;
+    }
+
+    // run multiple matmuls in a loop, accumulating large output tensors; this should OOM
+    status = compute(model, allocr, 160);
+    if (status == GGML_STATUS_SUCCESS) {
+        printf("main: compute failed to OOM (matmuls too small to exhaust GPU memory? loop count too small?)\n");
+        return GGML_EXIT_ABORTED;
+    }
+    printf("main: compute OOMed as expected: ggml status=%d (expected %d)\n", status, GGML_STATUS_ALLOC_FAILED);
+    return GGML_EXIT_SUCCESS;
+}
diff --git a/tests/test-timestep_embedding.cpp b/tests/test-timestep_embedding.cpp
index a55865973..e41942633 100644
--- a/tests/test-timestep_embedding.cpp
+++ b/tests/test-timestep_embedding.cpp
@@ -159,7 +159,7 @@ int main(int argc, const char** argv) {
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-    ggml_backend_graph_compute(backend, graph);
+    GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);
 
     float * output = new float[ggml_nelements(t)];
     ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));