From 4218641d971eb71fc6fb5b4b89606cff3d1a68b0 Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri, 15 Sep 2023 21:13:44 -0500 Subject: [PATCH] Separate CuBLAS/hipBLAS (#438) --- .gitignore | 4 ++++ CMakeLists.txt | 38 ++++++++++++++++++++++++++++---------- Makefile | 23 +++++++++++------------ koboldcpp.py | 35 ++++++++++++++++++++++++----------- 4 files changed, 67 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index 2fb1af29f9168..1a87853fd635f 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,7 @@ tests/test-tokenizer-0 /koboldcpp_cublas.dll /cublas64_11.dll /cublasLt64_11.dll +/rocblas/ +rocblas.dll +hipblas.dll +koboldcpp_hipblas.so diff --git a/CMakeLists.txt b/CMakeLists.txt index 58474caa97f5f..3afc1461f82d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,12 @@ if (LLAMA_CUBLAS) endif() if (LLAMA_HIPBLAS) - list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + if (MSVC) + list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD/ROCm/5.5") + else() + list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + endif() + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") @@ -387,16 +392,29 @@ target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (LLAMA_CUBLAS) + set(TARGET koboldcpp_cublas) + add_library(${TARGET} SHARED expose.cpp expose.h) + target_include_directories(${TARGET} PUBLIC . 
./otherarch ./otherarch/tools ./examples ./common) + target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump + set_target_properties(${TARGET} PROPERTIES PREFIX "") + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS}) + target_compile_features(${TARGET} PRIVATE cxx_std_11) +endif() -set(TARGET koboldcpp_cublas) -add_library(${TARGET} SHARED expose.cpp expose.h) -target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common) -target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump -set_target_properties(${TARGET} PROPERTIES PREFIX "") -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") -set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +if (LLAMA_HIPBLAS) + set(TARGET koboldcpp_hipblas) + add_library(${TARGET} SHARED expose.cpp expose.h) + target_include_directories(${TARGET} PUBLIC . 
./otherarch ./otherarch/tools ./examples ./common) + target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump + set_target_properties(${TARGET} PROPERTIES PREFIX "") + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS}) + target_compile_features(${TARGET} PRIVATE cxx_std_11) +endif() if (MAKE_MISC_FILES) diff --git a/Makefile b/Makefile index 4eb27e369cb1d..cadbeedb11786 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas +default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas koboldcpp_hipblas tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt dev: koboldcpp_openblas dev2: koboldcpp_clblast @@ -39,8 +39,8 @@ endif # # keep standard at C11 and C++11 -CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS +CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CXXFLAGS = -I. 
-I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = # these are used on windows, to build some libraries with extra old device compatibility @@ -211,18 +211,15 @@ endif # LLAMA_CUDA_FORCE_DMMV ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ - -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ - -DCC_TURING=1000000000 + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ - -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ - -DCC_TURING=1000000000 + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ - -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) \ - -DCC_TURING=1000000000 # DGGML_CUDA_DMMV_F16 does not currently work with AMD. 
+ -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h @@ -417,7 +414,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ clean: - rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so + rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -439,8 +436,10 @@ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o com $(NOAVX2_BUILD) koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS) $(CLBLAST_BUILD) -koboldcpp_cublas: ggml_cublas.o 
ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS) - $(CUBLAS_BUILD) $(HIPBLAS_BUILD) +koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS) + $(CUBLAS_BUILD) +koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(HIP_OBJS) $(OBJS) + $(HIPBLAS_BUILD) quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/koboldcpp.py b/koboldcpp.py index beb3fe16dd2e1..70b6e80708f09 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -103,6 +103,7 @@ def pick_existant_file(ntoption,nonntoption): lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so") lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so") lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so") +lib_hipblas = pick_existant_file("koboldcpp_hipblas.dll","koboldcpp_hipblas.so") def init_library(): @@ -113,6 +114,7 @@ def init_library(): use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir. use_clblast = False #uses CLBlast instead use_cublas = False #uses cublas instead + use_hipblas = False #uses hipblas instead use_noavx2 = False #uses no avx2 instructions use_failsafe = False #uses no intrinsics, failsafe mode if args.noavx2: @@ -131,11 +133,16 @@ def init_library(): print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.") use_clblast = True elif (args.usecublas is not None): - if not file_exists(lib_cublas): + if not file_exists(lib_cublas) and not file_exists(lib_hipblas): print("Warning: CuBLAS library file not found. 
Non-BLAS library will be used.") else: - print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.") - use_cublas = True + if file_exists(lib_cublas): + print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.") + use_cublas = True + elif file_exists(lib_hipblas): + print("Attempting to use hipBLAS library for faster prompt ingestion. A compatible AMD GPU will be required.") + use_hipblas = True + else: if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")): print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") @@ -157,6 +164,8 @@ def init_library(): libname = lib_clblast elif use_cublas: libname = lib_cublas + elif use_hipblas: + libname = lib_hipblas elif use_openblas: libname = lib_openblas else: @@ -766,10 +775,11 @@ def show_new_gui(): (lib_openblas, "Use OpenBLAS"), (lib_clblast, "Use CLBlast"), (lib_cublas, "Use CuBLAS"), + (lib_hipblas, "Use hipBLAS (ROCm)"), (lib_default, "Use No BLAS"), (lib_noavx2, "NoAVX2 Mode (Old CPU)"), (lib_failsafe, "Failsafe Mode (Old CPU)")] - openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) + openblas_option, clblast_option, cublas_option, hipblas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"] blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"] @@ -922,7 +932,7 @@ def setup_backend_tooltip(parent): def changerunmode(a,b,c): index = runopts_var.get() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or index == "Use CuBLAS" 
or index == "Use hipBLAS (ROCm)": gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") if index == "Use CLBlast": @@ -930,7 +940,7 @@ def changerunmode(a,b,c): quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") if gpu_choice_var.get()=="All": gpu_choice_var.set("1") - elif index == "Use CuBLAS": + elif index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") else: @@ -941,7 +951,7 @@ def changerunmode(a,b,c): quick_gpu_selector_box.grid_forget() CUDA_quick_gpu_selector_box.grid_forget() - if index == "Use CuBLAS": + if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw") @@ -952,7 +962,7 @@ def changerunmode(a,b,c): mmq_box.grid_forget() quick_mmq_box.grid_forget() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") gpu_layers_entry.grid(row=5, column=1, padx=8, pady=1, stick="nw") quick_gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") @@ -1147,7 +1157,7 @@ def export_vars(): gpuchoiceidx = int(gpu_choice_var.get())-1 if runopts_var.get() == "Use CLBlast": args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx] - if runopts_var.get() == "Use CuBLAS": + if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)": if gpu_choice_var.get()=="All": args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] else: @@ -1204,8 +1214,11 @@ def import_vars(dict): runopts_var.set(clblast_option) 
gpu_choice_var.set(str(["0 0", "1 0", "0 1", "1 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1)) elif "usecublas" in dict and dict["usecublas"]: - if cublas_option is not None: - runopts_var.set(cublas_option) + if cublas_option is not None or hipblas_option is not None: + if cublas_option: + runopts_var.set(cublas_option) + elif hipblas_option: + runopts_var.set(hipblas_option) lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) mmq_var.set(1 if "mmq" in dict["usecublas"] else 0) gpu_choice_var.set("All")