Merge branch 'LostRuins:concedo' into main
YellowRoseCx authored Jun 22, 2023
2 parents eb094f0 + e6ddb15 · commit b6ff890
Showing 6 changed files with 109 additions and 10 deletions.
4 changes: 2 additions & 2 deletions gpttype_adapter.cpp
@@ -672,7 +672,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
-                ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
+                ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -734,7 +734,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}
else if(file_format==FileFormat::MPT_1)
{
-            bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
+            bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
if(res==false)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
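Both call sites now forward the new inputs.gpulayers argument. The matching declarations live in model_adapter.h, which this diff does not show; based on the definitions in neox_v3.cpp and mpt_v3.cpp below, they presumably read:

ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers);
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers);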
23 changes: 23 additions & 0 deletions otherarch/gpt2_v3.cpp
@@ -345,6 +345,29 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}
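This block is the template for the offload code this commit adds to each loader: mark every per-layer weight tensor as GPU-backed, upload it with ggml_cl_transform_tensor, and tally the bytes moved. A minimal sketch of the shared pattern, assuming a helper named offload_tensors (an illustrative name; the commit inlines this loop in each of gpt2_v3.cpp, gptj_v3.cpp, mpt_v3.cpp, and neox_v3.cpp rather than factoring it out):

#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#include <initializer_list>

static size_t offload_tensors(std::initializer_list<ggml_tensor *> tensors) {
    size_t vram_total = 0;
    for (ggml_tensor * t : tensors) {
        t->backend = GGML_BACKEND_GPU;        // mark the weight as GPU-resident
        ggml_cl_transform_tensor(t->data, t); // move the data into an OpenCL device buffer
        vram_total += ggml_nbytes(t);         // track VRAM consumed; the loaders report it in MB
    }
    return vram_total;
}
#endif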

31 changes: 29 additions & 2 deletions otherarch/gptj_v3.cpp
@@ -15,7 +15,9 @@

#include "model_adapter.h"


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -331,7 +333,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g

fin.close();


//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}
31 changes: 28 additions & 3 deletions otherarch/mpt_v3.cpp
@@ -15,10 +15,12 @@

#include "model_adapter.h"


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +77,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
std::string word;
std::vector<char> buf(128);

-            for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));

@@ -278,6 +280,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return true;
}

29 changes: 27 additions & 2 deletions otherarch/neox_v3.cpp
@@ -13,10 +13,12 @@
#include <vector>
#include <iostream>


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -318,6 +320,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}
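For context, a minimal sketch of invoking the updated loader; the model path and layer count are placeholder values, and the FileFormat chosen is one of the NEOX_* variants that gpttype_adapter.cpp routes here:

gpt_neox_model model;
gpt_vocab vocab;
ModelLoadResult res = gpt_neox_model_load("/path/to/model.bin", model, vocab,
                                          FileFormat::NEOX_6, /*gpulayers=*/20);
if (res == ModelLoadResult::FAIL) {
    fprintf(stderr, "failed to load model\n");
}

Requesting more layers than the model has is safe: each loader clamps the count with std::min(gpulayers, int(hparams.n_layer)).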

1 change: 0 additions & 1 deletion otherarch/otherarch.h
@@ -43,7 +43,6 @@ struct gptj_layer {
struct ggml_tensor * c_mlp_fc_b;

struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
struct ggml_tensor * c_mlp_proj_b;
};
struct gptj_layer_v2 {
