diff --git a/Makefile b/Makefile index 2ad61559a11d9..9b79f4dae604c 100644 --- a/Makefile +++ b/Makefile @@ -160,8 +160,8 @@ ifdef LLAMA_HIPBLAS CC := $(ROCM_PATH)/llvm/bin/clang CXX := $(ROCM_PATH)/llvm/bin/clang++ GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030 - LLAMA_CUDA_DMMV_X ?= 128 - LLAMA_CUDA_DMMV_Y ?= 4 + LLAMA_CUDA_DMMV_X ?= 64 + LLAMA_CUDA_DMMV_Y ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 diff --git a/expose.h b/expose.h index fa9c303138831..b74718eb98918 100644 --- a/expose.h +++ b/expose.h @@ -8,6 +8,7 @@ struct load_model_inputs const int max_context_length; const int batch_size; const bool f16_kv; + const bool low_vram; const char * executable_path; const char * model_filename; const char * lora_filename; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 2fa3d08e76aa5..864366c729e6b 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -371,6 +371,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in //llama_ctx_paran_parts = -1; llama_ctx_params.seed = -1; llama_ctx_params.f16_kv = inputs.f16_kv; + llama_ctx_params.low_vram = inputs.low_vram; llama_ctx_params.logits_all = false; llama_ctx_params.use_mmap = inputs.use_mmap; llama_ctx_params.use_mlock = inputs.use_mlock; diff --git a/koboldcpp.py b/koboldcpp.py index dca7d129ea4d7..6bdd858a1a0e9 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -16,6 +16,7 @@ class load_model_inputs(ctypes.Structure): ("max_context_length", ctypes.c_int), ("batch_size", ctypes.c_int), ("f16_kv", ctypes.c_bool), + ("low_vram", ctypes.c_bool), ("executable_path", ctypes.c_char_p), ("model_filename", ctypes.c_char_p), ("lora_filename", ctypes.c_char_p), @@ -150,6 +151,7 @@ def load_model(model_filename): inputs.batch_size = 8 inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten inputs.threads = args.threads + inputs.low_vram = args.lowvram inputs.blasthreads = args.blasthreads inputs.f16_kv = True inputs.use_mmap = (not args.nommap) @@ -646,7 +648,7 @@ def onDropdownChange(event): #load all the vars args.threads = int(threads_var.get()) args.gpulayers = int(gpu_layers_var.get()) - + args.stream = (stream.get()==1) args.smartcontext = (smartcontext.get()==1) args.launch = (launchbrowser.get()==1) @@ -861,6 +863,7 @@ def main(args): parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+') compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') + parser.add_argument("--lowvram", help="Do not keep scratch memory in VRAM for CUDA", action='store_true') compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using CLBlast. Requires CLBlast.",metavar=('[GPU layers]'), type=int, default=0) args = parser.parse_args()