diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 864366c729e6b..5316617f956d0 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -672,7 +672,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
         {
-            ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
+            ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
             if(res==ModelLoadResult::FAIL)
             {
                 fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -734,7 +734,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else if(file_format==FileFormat::MPT_1)
    {
-        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
+        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
         if(res==false)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index af7d8da5daefe..ba2222f9990f4 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -345,6 +345,29 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return ModelLoadResult::SUCCESS;
 }
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 57137b677608c..0f0f8210516b7 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -15,7 +15,9 @@
 
 #include "model_adapter.h"
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
 ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -331,7 +333,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     fin.close();
 
-
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
     return ModelLoadResult::SUCCESS;
 }
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 7f16701d1c50e..f7ab03ec0d958 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -15,10 +15,12 @@
 
 #include "model_adapter.h"
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +77,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         std::string word;
         std::vector<char> buf(128);
 
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
             fin.read((char *) &len, sizeof(len));
 
@@ -278,6 +280,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
+            layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
+            layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
+            layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
+            ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
+            ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
+            ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return true;
 }
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index c65ceac71fd55..3084bbda7630c 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -13,10 +13,12 @@
 #include
 #include
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -318,6 +320,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return ModelLoadResult::SUCCESS;
 }
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index 469315066a696..7d396453d1be0 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -43,7 +43,6 @@ struct gptj_layer {
     struct ggml_tensor * c_mlp_fc_b;
 
     struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
     struct ggml_tensor * c_mlp_proj_b;
 };
 struct gptj_layer_v2 {