diff --git a/examples/common.cpp b/examples/common.cpp
index 92c7c07f8fe73..6ac4845559172 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -304,7 +304,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             params.main_gpu = std::stoi(argv[i]);
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
@@ -314,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -334,7 +334,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--low-vram" || arg == "-lv") {
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             params.low_vram = true;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -414,7 +414,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         exit(1);
     }
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
     if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
         fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
         exit(1);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8aea0f7cd8f12..de22d301342d6 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -565,7 +565,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -588,7 +588,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         }
         else if (arg == "--low-vram" || arg == "-lv")
         {
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             params.low_vram = true;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -599,7 +599,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
             params.main_gpu = std::stoi(argv[i]);
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
diff --git a/ggml.c b/ggml.c
index 961d90ce6fe84..3b8fc2e089cad 100644
--- a/ggml.c
+++ b/ggml.c
@@ -163,7 +163,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #endif
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -4119,7 +4119,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
     ggml_cl_init();
@@ -14908,7 +14908,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
     if (skip_cpu) {
         return;
@@ -16395,7 +16395,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
@@ -18696,7 +18696,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
diff --git a/llama-util.h b/llama-util.h
index c3c891937da49..3d5d9e3792a1a 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -441,7 +441,7 @@ struct llama_buffer {
     llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
diff --git a/llama.cpp b/llama.cpp
index 4489f35203633..e6da87f70bcf3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10,7 +10,7 @@
 #include "llama.h"
 
 #include "ggml.h"
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -175,7 +175,7 @@ struct llama_kv_cache {
             ggml_free(ctx);
         }
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
         ggml_cuda_free_data(k);
         ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
@@ -234,7 +234,7 @@ struct llama_model {
             ggml_free(ctx);
         }
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
@@ -800,7 +800,7 @@ struct llama_model_loader {
                     lmlock->grow_to(lock_size);
                 }
                 break;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
                 ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -920,7 +920,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
     }
@@ -1150,7 +1150,7 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1261,7 +1261,7 @@ static void llama_model_load_internal(
 
     (void) vram_scratch;
     (void) n_batch;
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
     if (low_vram) {
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
@@ -1274,7 +1274,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1314,7 +1314,7 @@ static void llama_model_load_internal(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -1435,7 +1435,7 @@ static bool llama_eval_internal(
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1450,7 +1450,7 @@ static bool llama_eval_internal(
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
             offload_func = ggml_cuda_assign_buffers;
         }
diff --git a/llama.h b/llama.h
index 085efcd7eecc1..a833a7f4d66cc 100644
--- a/llama.h
+++ b/llama.h
@@ -2,7 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
 #else
@@ -46,7 +46,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c
index 6785f397112cc..4403de3d17a1c 100644
--- a/otherarch/ggml_v2.c
+++ b/otherarch/ggml_v2.c
@@ -140,7 +140,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #endif
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -3897,7 +3897,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
         GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     ggml_v2_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
     if(quants_unshuffled)
@@ -9451,7 +9451,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9645,7 +9645,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9884,7 +9884,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -14064,7 +14064,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *
 
                         size_t cur = 0;
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_CUBLAS)
                         if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
@@ -15562,7 +15562,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }
 
 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h
index 9f65eb0d2c39e..00aedf8e64ecd 100644
--- a/otherarch/llama_v2-util.h
+++ b/otherarch/llama_v2-util.h
@@ -415,7 +415,7 @@ struct llama_v2_buffer {
     llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };
 
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
     uint8_t * addr = NULL;
diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp
index 103f89e3197b1..2f8e168ca299b 100644
--- a/otherarch/llama_v2.cpp
+++ b/otherarch/llama_v2.cpp
@@ -9,7 +9,7 @@
 #include "llama_v2.h"
 
 #include "ggml_v2.h"
-#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml_v2-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v2-opencl.h"
@@ -3088,4 +3088,4 @@ std::vector<llama_v2_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
     res.resize(n);
 
     return res;
-}
+}
\ No newline at end of file