From 1727e652f11c2e3d8de7e4d743a98cac329ed8c3 Mon Sep 17 00:00:00 2001 From: shutup <> Date: Fri, 7 Jul 2023 11:52:58 +0800 Subject: [PATCH 01/34] expose some useful info that can be used in statistics of performence --- expose.cpp | 8 ++++++++ expose.h | 2 ++ gpttype_adapter.cpp | 6 ++++++ koboldcpp.py | 5 ++++- 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/expose.cpp b/expose.cpp index 67d634d5639bb..8f787f108b876 100644 --- a/expose.cpp +++ b/expose.cpp @@ -220,6 +220,14 @@ extern "C" return generation_finished; } + float get_prompt_eval_time() { + return prompt_eval_time; + } + + float get_prompt_process_time() { + return prompt_process_time; + } + const char* get_pending_output() { return gpttype_get_pending_output().c_str(); } diff --git a/expose.h b/expose.h index b74718eb98918..2425eabd63565 100644 --- a/expose.h +++ b/expose.h @@ -54,3 +54,5 @@ extern std::string lora_filename; extern std::string lora_base; extern std::vector generated_tokens; extern bool generation_finished; +extern float prompt_eval_time; +extern float prompt_process_time; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 8d49b67b36677..09e63716134aa 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -33,6 +33,8 @@ std::string executable_path = ""; std::string lora_filename = ""; std::string lora_base = ""; bool generation_finished; +float prompt_process_time; +float prompt_eval_time; std::vector generated_tokens; //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) @@ -807,6 +809,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o bool stream_sse = inputs.stream_sse; generation_finished = false; // Set current generation status + prompt_eval_time = 0; + prompt_process_time = 0; generated_tokens.clear(); // New Generation, new tokens if (params.repeat_last_n < 1) @@ -1327,6 +1331,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o fflush(stdout); output.status = 1; generation_finished = true; + prompt_eval_time = pt2; + prompt_process_time = pt1; snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str()); return output; diff --git a/koboldcpp.py b/koboldcpp.py index 2041b0d24a7b4..26db2b432dc39 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -151,6 +151,8 @@ def init_library(): handle.new_token.argtypes = [ctypes.c_int] handle.get_stream_count.restype = ctypes.c_int handle.has_finished.restype = ctypes.c_bool + handle.get_prompt_eval_time.restype = ctypes.c_float + handle.get_prompt_process_time.restype = ctypes.c_float handle.abort_generate.restype = ctypes.c_bool handle.get_pending_output.restype = ctypes.c_char_p @@ -485,7 +487,8 @@ def do_POST(self): newprompt = fullprompt gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag)) - + gen['prompt_process_time'] = handle.get_prompt_process_time() + gen['prompt_eval_time'] = handle.get_prompt_eval_time() try: self.send_response(200) self.end_headers() From 3e08ae99ceb143d67f9273fda47541e9d98ff23f Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 7 Jul 2023 16:12:49 +0300 Subject: [PATCH 02/34] convert.py: add mapping for safetensors bf16 (#1598) Fixes #1473 --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 66509b99c8f3e..7a2705e5c506f 100644 --- a/convert.py +++ b/convert.py @@ -828,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { + 'BF16': DT_BF16, 'F16': DT_F16, 'F32': DT_F32, 'I32': DT_I32, From 72421402834141df6cbdcf595fe46dbd11874dce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 18:36:37 +0300 Subject: [PATCH 03/34] ggml : remove sched_yield() call in ggml_graph_compute_thread() (#2134) --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index d257c3d657b34..4ba7ac9313820 100644 --- a/ggml.c +++ b/ggml.c @@ -16042,7 +16042,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } From 8edcb337c6e763beea97f2a604480cb8f8a62825 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 7 Jul 2023 23:37:55 +0800 Subject: [PATCH 04/34] added ability to select "all devices" --- koboldcpp.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 53ffe41c331ef..998a54aa5776e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -763,7 +763,7 @@ def getfilename(var, text): # gpu options quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50) quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) - quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5) # hides gpu options when CLBlast is not chosen @@ -828,7 +828,7 @@ def changerunmode(a,b,c): # gpu options gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50) gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) - gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5) # presets selector @@ -959,11 +959,16 @@ def switch_old_gui(): args.smartcontext = smartcontext.get()==1 args.unbantokens = unbantokens.get()==1 - gpuchoiceidx = int(gpu_choice_var.get())-1 + gpuchoiceidx = 0 + if gpu_choice_var.get()!="All": + gpuchoiceidx = int(gpu_choice_var.get())-1 if runopts_var.get() == runopts[1]: args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] if runopts_var.get() == runopts[2]: - args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] + if gpu_choice_var.get()=="All": + args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] + else: + args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] if gpulayers_var.get(): args.gpulayers = int(gpulayers_var.get()) if runopts_var.get()==runopts[3]: From 1d656d6360359cfdaaf5d64ed9690047b600dbcb Mon Sep 17 00:00:00 2001 From: Qingyou Meng Date: Sat, 8 Jul 2023 00:24:01 +0800 Subject: [PATCH 05/34] ggml : change ggml_graph_compute() API to not require context (#1999) * ggml_graph_compute: deprecate using ggml_context, try resolve issue #287 * rewrite: no longer consider backward compitability; plan and make_plan * minor: rename ctx as plan; const * remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward * add static ggml_graph_compute_sugar() * minor: update comments * reusable buffers * ggml : more consistent naming + metal fixes * ggml : fix docs * tests : disable grad / opt + minor naming changes * ggml : add ggml_graph_compute_with_ctx() - backwards compatible API - deduplicates a lot of copy-paste * ci : enable test-grad0 * examples : factor out plan allocation into a helper function * llama : factor out plan stuff into a helper function * ci : fix env * llama : fix duplicate symbols + refactor example benchmark * ggml : remove obsolete assert + refactor n_tasks section * ggml : fix indentation in switch * llama : avoid unnecessary bool * ggml : remove comments from source file and match order in header --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 13 +- examples/baby-llama/baby-llama.cpp | 24 +- examples/benchmark/benchmark-matmult.cpp | 29 +- examples/metal/metal.cpp | 3 +- .../train-text-from-scratch.cpp | 27 +- ggml-metal.h | 6 +- ggml-metal.m | 11 +- ggml.c | 682 +++++++++--------- ggml.h | 36 +- llama.cpp | 54 +- tests/CMakeLists.txt | 2 +- tests/test-grad0.c | 35 +- tests/test-opt.c | 18 +- 13 files changed, 531 insertions(+), 409 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12481e8be7cf7..a576139efd0ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,9 @@ on: paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GGML_NLOOP: 3 + GGML_NITER: 1 jobs: ubuntu-focal-make: @@ -64,7 +66,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -99,7 +101,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 macOS-latest-make: runs-on: macos-latest @@ -147,10 +149,11 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 windows-latest-cmake: runs-on: windows-latest + env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 @@ -249,7 +252,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --verbose + ctest -C Release --verbose --timeout 900 - name: Get commit hash id: commit diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 212f54d32cbad..4965881ecec22 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) { return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, @@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + std::vector work_buffer; + for (int ex=0; ex & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -159,13 +170,14 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); TENSOR_DUMP(m11); TENSOR_DUMP(m2); - ggml_graph_compute(ctx, &gf); + std::vector work_buffer; + + ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); TENSOR_DUMP(gf.nodes[0]); @@ -187,7 +199,6 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); - gf31.n_threads=benchmark_params.n_threads; // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -199,8 +210,7 @@ int main(int argc, char ** argv) { //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); - gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; const int dimy = sizey; @@ -221,14 +231,15 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute(ctx, &gf31); + ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); + long long int stop = ggml_time_us(); long long int usec = stop-start; double gflops = (double)(flops_per_matrix)/usec/1000.0; gflops_sum += gflops; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", i, - gf31.n_threads, + benchmark_params.n_threads, sizex, sizey, sizez, flops_per_matrix, usec,gflops); @@ -253,7 +264,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute(ctx, &gf32); + ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index cdfe4bfe97865..7438defdefcdf 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -35,10 +35,9 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(); + auto * ctx_metal = ggml_metal_init(1); const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c50eeb343bcef..b96fdcdc44b57 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->n_nodes = 0; gf->n_leafs = 0; - gf->work_size = 0; gf->perf_runs = 0; gf->perf_cycles = 0; gf->perf_time_us = 0; - gf->work = NULL; const auto & hparams = model->hparams; //const int n_ctx = hparams.n_ctx; @@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); + // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; @@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } + std::vector work_buffer; + printf("%s: begin training\n", __func__); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - // ggml_cgraph gf = {}; - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - ggml_graph_compute(ctx0, gf); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute(ctx0, gf); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(cparams); ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); @@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); llama_free_model(lmodel); ggml_free(model.ctx); diff --git a/ggml-metal.h b/ggml-metal.h index b9e50ac745eb0..928f1705c381c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/ggml-metal.m b/ggml-metal.m index fd69c41fe357d..3f15f791f9f65 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -25,6 +25,8 @@ }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,11 +88,12 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -354,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; diff --git a/ggml.c b/ggml.c index 4ba7ac9313820..55b0aff03bf16 100644 --- a/ggml.c +++ b/ggml.c @@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -10718,8 +10717,6 @@ static void ggml_compute_forward_mul_mat( float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } @@ -15772,9 +15769,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15945,12 +15939,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -15974,9 +15969,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -15989,15 +15988,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16008,11 +16007,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16020,7 +16020,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; @@ -16052,16 +16052,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16069,349 +16070,372 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + size_t work_size = 0; - // initialize tasks + work buffer - { - size_t work_size = 0; + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; + struct ggml_tensor * node = cgraph->nodes[i]; - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - node->n_tasks = n_threads; + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } + size_t cur = 0; + + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_TANH: - case GGML_OP_ELU: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else -#endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } + } else +#endif + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; + + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C - const int64_t nk = ne00*ne01; + const int64_t nk = ne00*ne01; - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; } - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } } } + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16473,6 +16497,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -16511,14 +16546,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17254,9 +17288,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -17303,7 +17334,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17383,7 +17415,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17505,7 +17538,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17573,9 +17607,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -17627,7 +17658,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/ggml.h b/ggml.h index d0710c5559170..ab84bef68747e 100644 --- a/ggml.h +++ b/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -418,9 +418,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -432,19 +429,27 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1290,15 +1295,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index 02afdeb14078f..ee6ec0920fc9c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + static const std::map & MEM_REQ_SCRATCH0() { static std::map k_sizes = { @@ -321,6 +340,9 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -758,7 +780,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -1265,7 +1286,7 @@ static bool llama_eval_internal( const float * embd, const int n_tokens, const int n_past, - const int n_threads, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); @@ -1306,10 +1327,11 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1593,6 +1615,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { @@ -1612,10 +1635,10 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } #else - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); #endif if (cgraph_fname) { @@ -2575,8 +2598,8 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; @@ -2645,7 +2668,7 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2802,6 +2825,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + std::vector work_buffer; + while (true) { int32_t n_dims; int32_t length; @@ -2966,8 +2992,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3120,7 +3146,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3140,7 +3165,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3226,7 +3251,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3246,7 +3270,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..1acf050a743e4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c index a3e25214b84eb..da4001ce5269f 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,6 +10,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#pragma GCC diagnostic ignored "-Wdouble-promotion" + #define MAX_NARGS 3 #undef MIN @@ -49,7 +51,7 @@ float frand(void) { int irand(int n) { if (n == 0) return 0; - else return rand()%n; + return rand()%n; } void get_random_dims(int64_t * dims, int ndims) { @@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int( float get_element(const struct ggml_tensor * t, int idx) { if (t->type == GGML_TYPE_F32) { return ((float *)t->data)[idx]; - } else if (t->type == GGML_TYPE_I32) { + } + + if (t->type == GGML_TYPE_I32) { return ((int32_t *)t->data)[idx]; - } else { - assert(false); - return INFINITY; } + + assert(false); + return INFINITY; } void set_element(struct ggml_tensor * t, int idx, float value) { @@ -215,15 +219,14 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - gf.n_threads = n_threads; - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - gb.n_threads = n_threads; - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -236,15 +239,16 @@ bool check_gradient( const float xm = x0 - eps; const float xp = x0 + eps; set_element(x[i], k, xp); - ggml_graph_compute(ctx0, &gf); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - ggml_graph_compute(ctx0, &gf); - const float f1 = ggml_get_f32_1d(f, 0); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + const float f1 = ggml_get_f32_1d(f, 0); const float g0 = (f0 - f1)/(2.0f*eps); set_element(x[i], k, x0); @@ -252,12 +256,13 @@ bool check_gradient( // compute gradient using backward graph ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); const float g1 = get_element(x[i]->grad, k); const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", diff --git a/tests/test-opt.c b/tests/test-opt.c index d001615ee353b..e928a7df7ee68 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,6 +7,7 @@ #define MAX_NARGS 2 +#pragma GCC diagnostic ignored "-Wdouble-promotion" // // logging @@ -33,7 +34,7 @@ #define GGML_PRINT(...) printf(__VA_ARGS__) -float frand() { +float frand(void) { return (float)rand()/(float)RAND_MAX; } @@ -114,7 +115,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } -int main(int argc, const char ** argv) { +int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, .mem_buffer = NULL, @@ -137,10 +138,11 @@ int main(int argc, const char ** argv) { struct ggml_tensor * d = ggml_sub(ctx, c, ab); struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); - struct ggml_cgraph ge = ggml_build_forward(e); - ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -148,8 +150,10 @@ int main(int argc, const char ** argv) { ggml_opt(ctx, opt_params, e); - ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); printf("%s: optimized e = %.4f\n", __func__, fe_opt); From a7e20edf2266169ccd97a4eb949a593d628fbd64 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 21:23:57 +0300 Subject: [PATCH 06/34] ci : switch threads to 1 (#2138) --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a576139efd0ee..f6a2dd6daa198 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,7 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_NITER: 1 + GGML_N_THREADS: 1 jobs: ubuntu-focal-make: From 84525e7962bee0abef91108948bbf7f7bfdcf421 Mon Sep 17 00:00:00 2001 From: dylan Date: Fri, 7 Jul 2023 11:25:25 -0700 Subject: [PATCH 07/34] docker : add support for CUDA in docker (#1461) Co-authored-by: canardleteer Co-authored-by: Georgi Gerganov --- .devops/full-cuda.Dockerfile | 33 +++++++++++++++++++++++++++++++++ .devops/main-cuda.Dockerfile | 32 ++++++++++++++++++++++++++++++++ Makefile | 8 +++++++- README.md | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 .devops/full-cuda.Dockerfile create mode 100644 .devops/main-cuda.Dockerfile diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile new file mode 100644 index 0000000000000..e5fcb37d6fe7a --- /dev/null +++ b/.devops/full-cuda.Dockerfile @@ -0,0 +1,33 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 + +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip + +COPY requirements.txt requirements.txt + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile new file mode 100644 index 0000000000000..30c01196ab520 --- /dev/null +++ b/.devops/main-cuda.Dockerfile @@ -0,0 +1,32 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable cuBLAS +ENV LLAMA_CUBLAS=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +COPY --from=build /app/main /main + +ENTRYPOINT [ "/main" ] diff --git a/Makefile b/Makefile index 71415664bd7e6..6068cbe7b0a0f 100644 --- a/Makefile +++ b/Makefile @@ -163,7 +163,12 @@ ifdef LLAMA_CUBLAS LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc - NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native + NVCCFLAGS = --forward-unknown-to-host-compiler +ifdef CUDA_DOCKER_ARCH + NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) +else + NVCCFLAGS += -arch=native +endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV @@ -187,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif + ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS diff --git a/README.md b/README.md index 863aef123ad9a..7953fd3a0e0e9 100644 --- a/README.md +++ b/README.md @@ -731,6 +731,38 @@ or with a light image: docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 ``` +### Docker With CUDA + +Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. + +#### Building Locally + +```bash +docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . +docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . +``` + +You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. + +The defaults are: + +- `CUDA_VERSION` set to `11.7.1` +- `CUDA_DOCKER_ARCH` set to `all` + +The resulting images, are essentially the same as the non-CUDA images: + +1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. + +#### Usage + +After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag. + +```bash +docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +``` + ### Contributing - Contributors can open PRs From 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 8 Jul 2023 00:25:15 +0200 Subject: [PATCH 08/34] CUDA: add __restrict__ to mul mat vec kernels (#2140) --- ggml-cuda.cu | 53 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7965ff74111f7..ec41e3524a1ca 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -59,8 +59,8 @@ typedef float2 dfloat2; #endif //GGML_CUDA_DMMV_F16 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_op_t)( @@ -131,7 +131,7 @@ typedef struct { } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); -typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs); //================================= k-quants @@ -407,7 +407,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -440,7 +440,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -504,7 +504,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -544,7 +544,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -590,7 +590,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -634,7 +634,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { #endif } -static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -742,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -846,7 +846,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -949,7 +949,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { const int row = blockIdx.x; const int num_blocks_per_row = ncols / QK_K; @@ -1053,7 +1053,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -1171,7 +1171,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1207,7 +1207,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) { } template -static __global__ void dequantize_block(const void * vx, float * y, const int k) { +static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; if (i >= k) { @@ -1227,7 +1227,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; @@ -1252,7 +1252,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; @@ -1277,7 +1277,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; @@ -1312,7 +1312,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; @@ -1346,7 +1346,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons #endif // __CUDA_ARCH__ >= 600 } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) { +static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; @@ -1366,7 +1366,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons } template -static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) { +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row >= nrows) { @@ -1404,7 +1404,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d } template -static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) { +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -1471,7 +1471,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } } -static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; @@ -1518,7 +1518,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl } static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous - const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int channel_stride_x) { const half * x = (const half *) vx; @@ -2355,10 +2355,7 @@ inline void ggml_cuda_op_mul_mat_vec( src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6. - // However, they have bad performance with Pascal cards. - // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q. - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { From 185416884131b7f7330405e0a730197f5ccd3284 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 8 Jul 2023 20:31:49 +0800 Subject: [PATCH 09/34] This allows LLAMA models that were previously incompatible with K quants to function mostly as normal. This happens when a model has a vocab != 32000, e.g 32001 which means it's not divisible by 256 or 64. Since the problematic dimensions only apply for `tok_embeddings.weight` and `output.weight` (dimentions 4096 x n_vocab), we can simply quantize these layers to Q8_0 whereas the majority of the hidden layers are still K-quanted since they have compatible dimensions. --- llama.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index ee6ec0920fc9c..be48a4e185bc3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2428,15 +2428,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS + bool convert_incompatible_tensor = false; if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); - fprintf(stderr, "This is required to be able to use k-quants for now!\n"); - fprintf(stderr, "========================================================================================\n\n"); - throw std::runtime_error("Unsupported tensor size encountered\n"); + fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + fprintf(stderr, "Q8_0 will be used for this tensor instead.\n"); + convert_incompatible_tensor = true; } } if (tensor.name == "output.weight") { @@ -2464,6 +2464,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + if(convert_incompatible_tensor) + { + new_type = GGML_TYPE_Q8_0; //fall back to Q8_0 instead of just failing. + } #endif float * f32_data; From 64639555ff93c8ead2b80becb49cc6b60aeac240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 8 Jul 2023 20:01:44 +0200 Subject: [PATCH 10/34] Fixed OpenLLaMA 3b CUDA mul_mat_vec_q (#2144) --- ggml-cuda.cu | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ec41e3524a1ca..fd36f179b6144 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -208,6 +208,7 @@ typedef struct { static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 @@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) { +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest block_q8_1 * y = (block_q8_1 *) vy; - const int ib = i / QK8_0; // block index - const int iqs = i % QK8_0; // quant index + const int ib = i / QK8_1; // block index + const int iqs = i % QK8_1; // quant index - const float xi = x[i]; + const float xi = i < ndata ? x[i] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con rms_norm_f32<<>>(x, dst, ncols); } -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) { +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - quantize_q8_1<<>>(x, vy, k); + quantize_q8_1<<>>(x, vy, ndata, k); } static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -2359,9 +2360,11 @@ inline void ggml_cuda_op_mul_mat_vec( #endif if (use_mul_mat_vec_q) { + int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; + padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main); + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); switch (src0->type) { case GGML_TYPE_Q4_0: @@ -3105,7 +3108,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_backend backend = tensor->backend; struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -3134,11 +3141,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { int64_t nrows_split = row_high - row_low; const size_t offset_split = row_low*nb1; - const size_t size = ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } - void * buf; + char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - void * buf_host = (char*)data + offset_split; + char * buf_host = (char*)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); From 2492a53fd0d8372ecc67f49f07b581905175eea8 Mon Sep 17 00:00:00 2001 From: rankaiyx Date: Sun, 9 Jul 2023 15:38:42 +0800 Subject: [PATCH 11/34] readme : add more docs indexes (#2127) * Update README.md to add more docs indexes * Update README.md to add more docs indexes --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7953fd3a0e0e9..318632989d218 100644 --- a/README.md +++ b/README.md @@ -783,5 +783,10 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / ### Docs -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [main](./examples/main/README.md) +- [server](./examples/server/README.md) +- [embd-input](./examples/embd-input/README.md) +- [jeopardy](./examples/jeopardy/README.md) +- [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) +- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) From 3bbc1a11f04a9adc0d0e08c2940ba4d2978755ab Mon Sep 17 00:00:00 2001 From: clyang Date: Sun, 9 Jul 2023 16:12:20 +0800 Subject: [PATCH 12/34] ggml : fix buidling with Intel MKL but ask for "cblas.h" issue (#2104) (#2115) * Fix buidling with Intel MKL but ask for "cblas.h" issue * Use angle brackets to indicate the system library --- CMakeLists.txt | 3 +++ ggml.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a2404548f90d4..eed7b1b7bed9e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,9 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) diff --git a/ggml.c b/ggml.c index 55b0aff03bf16..c10877a761dd8 100644 --- a/ggml.c +++ b/ggml.c @@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) From 18780e0a5e17348236230bbe891901b9b5718709 Mon Sep 17 00:00:00 2001 From: JackJollimore <130917767+JackJollimore@users.noreply.github.com> Date: Sun, 9 Jul 2023 05:20:43 -0300 Subject: [PATCH 13/34] readme : update Termux instructions (#2147) The file pathing is significant when running models inside of Termux on Android devices. llama.cpp performance is improved with loading a .bin from the $HOME directory. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 318632989d218..daa71c2b939da 100644 --- a/README.md +++ b/README.md @@ -695,7 +695,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. -Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. +Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script. ### Docker From db4047ad5cd8eae04db3b2efe0245e69a376601a Mon Sep 17 00:00:00 2001 From: Nigel Bosch Date: Sun, 9 Jul 2023 03:56:18 -0500 Subject: [PATCH 14/34] main : escape prompt prefix/suffix (#2151) --- examples/common.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/common.cpp b/examples/common.cpp index 3278a064346b4..93159c6dfc6de 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -418,6 +418,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (escape_prompt) { process_escapes(params.prompt); + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); } return true; From 1d1630996920f889cdc08de26cebf2415958540e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 9 Jul 2023 05:59:53 -0300 Subject: [PATCH 15/34] llama : remove "first token must be BOS" restriction (#2153) --- llama.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index ee6ec0920fc9c..a491f1c7e15c4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1291,12 +1291,6 @@ static bool llama_eval_internal( LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); - // enforce that the first token is BOS - if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; From 242f01e983955e5c7ae807033563dd9f1ed26f9c Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun, 9 Jul 2023 17:10:14 -0500 Subject: [PATCH 16/34] Add Multi-GPU CuBLAS support in the new GUI --- koboldcpp.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 53ffe41c331ef..1c0ee59c3b919 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -9,6 +9,7 @@ import os import argparse import json, sys, http.server, time, asyncio, socket, threading +import re from concurrent.futures import ThreadPoolExecutor stop_token_max = 10 @@ -764,21 +765,27 @@ def getfilename(var, text): quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50) quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5) - # hides gpu options when CLBlast is not chosen def changerunmode(a,b,c): index = runopts_var.get() if index == "Use CLBlast" or index == "Use CuBLAS": gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") - gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") - quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + if index == "Use CLBlast": + gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + elif index == "Use CuBLAS": + CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") + CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") else: gpu_selector_label.grid_forget() gpu_selector_box.grid_forget() + CUDA_gpu_selector_box.grid_forget() quick_gpu_selector_label.grid_forget() quick_gpu_selector_box.grid_forget() + CUDA_quick_gpu_selector_box.grid_forget() if index == "Use CuBLAS": lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") @@ -829,6 +836,7 @@ def changerunmode(a,b,c): gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50) gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly") lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5) # presets selector @@ -958,12 +966,18 @@ def switch_old_gui(): args.stream = stream.get()==1 args.smartcontext = smartcontext.get()==1 args.unbantokens = unbantokens.get()==1 + gpu_choice_str = gpu_choice_var.get() + if gpu_choice_str.isdigit(): + gpuchoiceidx = int(gpu_choice_var.get())-1 - gpuchoiceidx = int(gpu_choice_var.get())-1 if runopts_var.get() == runopts[1]: args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] if runopts_var.get() == runopts[2]: - args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] + if gpu_choice_str.lower() == "all": + args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] + else: + args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)] + if gpulayers_var.get(): args.gpulayers = int(gpulayers_var.get()) if runopts_var.get()==runopts[3]: @@ -1329,7 +1343,7 @@ def main(args): compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer. Enter a number after to select a different main GPU.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2']) + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2']) parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0) args = parser.parse_args() main(args) From f1014f3cc7d5226d0119888bef58d166a4adf731 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon, 10 Jul 2023 00:26:40 -0500 Subject: [PATCH 17/34] remove unused .re --- koboldcpp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 1c0ee59c3b919..8450554757189 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -9,7 +9,6 @@ import os import argparse import json, sys, http.server, time, asyncio, socket, threading -import re from concurrent.futures import ThreadPoolExecutor stop_token_max = 10 From 28279200449987c8f2a72df75ca91f09b6e557fd Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 10 Jul 2023 18:23:25 +0800 Subject: [PATCH 18/34] fix compile errors, rwkv not working --- gpttype_adapter.cpp | 6 +++--- otherarch/gpt2_v3.cpp | 3 +-- otherarch/gptj_v3.cpp | 3 +-- otherarch/mpt_v3.cpp | 3 +-- otherarch/neox_v3.cpp | 3 +-- otherarch/rwkv_v3.cpp | 10 ++++------ otherarch/rwkv_v3.h | 4 ++-- 7 files changed, 13 insertions(+), 19 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index b876f84417e7d..5996911a1c637 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -563,7 +563,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in rwkv_ctx_v3->logits_out = (float *)malloc(logitbufsiz); rwkv_ctx_v3->state_in = nullptr; - bool testeval = rwkv_eval(rwkv_ctx_v3, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out); + bool testeval = rwkv_eval(rwkv_ctx_v3, params.n_threads, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out); if (!testeval) { printf("\nError: RWKV Init Eval Failed!\n"); @@ -1162,12 +1162,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { if(embd.size()>1) { - evalres = rwkv_eval_sequence(rwkv_ctx_v3, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out); + evalres = rwkv_eval_sequence(rwkv_ctx_v3, params.n_threads, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out); } else { bool ignoreLogits = (!startedsampling && ((int)embd_inp.size() > input_consumed + 2)); - evalres = rwkv_eval(rwkv_ctx_v3, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out); + evalres = rwkv_eval(rwkv_ctx_v3, params.n_threads, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out); } memcpy(logits.data(), rwkv_ctx_v3->logits_out, sizeof(float) * rwkv_vocab.size()); diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index b507357c4555a..2e7806d3d27a3 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -447,7 +447,6 @@ bool gpt2_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -708,7 +707,7 @@ bool gpt2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index be4ad60daece9..aeaa7bbd58378 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -445,7 +445,6 @@ bool gptj_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -620,7 +619,7 @@ bool gptj_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index e4cf99fd7410c..35006688d2b89 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -383,7 +383,6 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -543,7 +542,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 29b2d06c8e27e..9c1ab254587eb 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -461,7 +461,6 @@ bool gpt_neox_eval( struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -639,7 +638,7 @@ bool gpt_neox_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index 0ccf8d8fac1ce..2ef44dd1bbe63 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -1511,7 +1511,6 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptrn_threads = n_threads; RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph( serial_graph.ctx.ctx, instance->model, @@ -1609,7 +1608,7 @@ void rwkv_get_outputs(const struct rwkv_context * ctx, float * state_out, float } } -bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out) { +bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t token, const float * state_in, float * state_out, float * logits_out) { ctx->last_error = RWKV_ERROR_NONE; const struct rwkv_file_header & header = ctx->instance->model.header; @@ -1628,13 +1627,13 @@ bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * st ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs; } - ggml_graph_compute(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get()); + ggml_graph_compute_with_ctx(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); return true; } -bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, const size_t sequence_len, const float * state_in, float * state_out, float * logits_out) { +bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const uint32_t * sequence, const size_t sequence_len, const float * state_in, float * state_out, float * logits_out) { ctx->last_error = RWKV_ERROR_NONE; const struct rwkv_file_header & header = ctx->instance->model.header; @@ -1690,7 +1689,6 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, co sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len); sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph()); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph"); - sequence_graph.cgraph->n_threads = 1; RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph( sequence_graph.ctx.ctx, ctx->instance->model, @@ -1717,7 +1715,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, co ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs; } - ggml_graph_compute(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get()); + ggml_graph_compute_with_ctx(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); } diff --git a/otherarch/rwkv_v3.h b/otherarch/rwkv_v3.h index b24812fc2ff98..b9e0d57e2d6c7 100644 --- a/otherarch/rwkv_v3.h +++ b/otherarch/rwkv_v3.h @@ -111,7 +111,7 @@ extern "C" { // - state_in: FP32 buffer of size rwkv_get_state_len(); or NULL, if this is a first pass. // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL. // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL. - RWKV_API bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out); + RWKV_API bool rwkv_eval(struct rwkv_context *, const int n_threads, const uint32_t token, const float * state_in, float * state_out, float * logits_out); // Evaluates the model for a sequence of tokens. // Uses a faster algorithm than rwkv_eval if you do not need the state and logits for every token. Best used with batch sizes of 64 or so. @@ -125,7 +125,7 @@ extern "C" { // - state_in: FP32 buffer of size rwkv_get_state_len(), or NULL if this is a first pass. // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL. // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL. - RWKV_API bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * tokens, size_t sequence_len, const float * state_in, float * state_out, float * logits_out); + RWKV_API bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const uint32_t * tokens, size_t sequence_len, const float * state_in, float * state_out, float * logits_out); // Returns the number of tokens in the given model's vocabulary. // Useful for telling 20B_tokenizer models (n_vocab = 50277) apart from World models (n_vocab = 65536). From 523fc3be523ecacc179c8ef5b87c38eb9f0cc47a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 10 Jul 2023 20:05:53 +0800 Subject: [PATCH 19/34] fixed rwkv, standardized new ctx usage --- .gitignore | 4 +++- otherarch/gpt2_v3.cpp | 2 +- otherarch/gptj_v3.cpp | 2 +- otherarch/mpt_v3.cpp | 2 +- otherarch/neox_v3.cpp | 2 +- otherarch/rwkv_v3.cpp | 7 +++++-- otherarch/utils.cpp | 12 ++++++++++++ otherarch/utils.h | 4 +++- 8 files changed, 27 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index c79b78b577793..622f5dd67297c 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,6 @@ koboldcpp_failsafe.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll -koboldcpp_cublas.dll \ No newline at end of file +koboldcpp_cublas.dll +cublas64_11.dll +cublasLt64_11.dll \ No newline at end of file diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 2e7806d3d27a3..608a61ac226d7 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -707,7 +707,7 @@ bool gpt2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index aeaa7bbd58378..46f1ad0642dff 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -619,7 +619,7 @@ bool gptj_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 35006688d2b89..211464f89c4d8 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -542,7 +542,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 9c1ab254587eb..d8ccaa9b624c3 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -638,7 +638,7 @@ bool gpt_neox_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index 2ef44dd1bbe63..3bdf221fdfb81 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -13,6 +13,8 @@ #include "ggml-opencl.h" #endif +#include "utils.h" + #include #include #include @@ -729,6 +731,7 @@ struct rwkv_context { float * logits_out = 0; //stores address of output logit buffer size_t gpu_layers; + std::vector work_buffer; }; // https://stackoverflow.com/a/6458689 @@ -1627,7 +1630,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs; } - ggml_graph_compute_with_ctx(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); return true; @@ -1715,7 +1718,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs; } - ggml_graph_compute_with_ctx(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); } diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 02637069a9b98..16e015c841b35 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -221,4 +221,16 @@ bool should_transpose_layer(std::string name) return true; } return false; +} + +static std::vector kcpp_compute_buf; +void kcpp_graph_compute_helper(ggml_cgraph *graph, int n_threads) +{ + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + if (plan.work_size > 0) + { + kcpp_compute_buf.resize(plan.work_size); + plan.work_data = kcpp_compute_buf.data(); + } + ggml_graph_compute(graph, &plan); } \ No newline at end of file diff --git a/otherarch/utils.h b/otherarch/utils.h index f9857823faf34..cbd7bfb5117ad 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -54,4 +54,6 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri -bool should_transpose_layer(std::string name); \ No newline at end of file +bool should_transpose_layer(std::string name); + +void kcpp_graph_compute_helper(ggml_cgraph * graph, int n_threads); \ No newline at end of file From 9324cb804a45b351cb2deba46501e7af6372b30b Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 10 Jul 2023 22:49:27 +0800 Subject: [PATCH 20/34] reimplemented save and load --- klite.embd | 12 +++-- koboldcpp.py | 138 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 123 insertions(+), 27 deletions(-) diff --git a/klite.embd b/klite.embd index dbbe8a99cb08e..c99e0341083e4 100644 --- a/klite.embd +++ b/klite.embd @@ -1,6 +1,6 @@