Fine tune MUL_MAT, new threading (spin+wait/notify), speedup q_f32 BLAS by splitting COMPUTE stage #1632

Closed · wants to merge 24 commits

Commits (24)
213f133
initial
mqy Jun 14, 2023
1b041d7
threading test: improve readability of both code and output
mqy Jun 14, 2023
48016f6
bulk refactored task profile to support complete fallback; enable tun…
mqy Jun 14, 2023
9106232
threading test: at GitHub, Windows can take more than 20 seconds to s…
mqy Jun 14, 2023
bb590f1
Workaround to set node->backend
mqy Jun 15, 2023
7c05049
tuning: check GPU offloading before loading model
mqy Jun 15, 2023
21e9379
tuning: add f16; todo: f32 failed with CL
mqy Jun 15, 2023
5342dc0
tuning: support k_quants; disabled rope shapes (workaround); make ca…
mqy Jun 15, 2023
6b83a3e
try to make CL run w/o tuning, but -ngl gets stuck with no output; had to add ta…
mqy Jun 16, 2023
06b0082
bulk refactoring of task profile and related code to run CL GPU offloading.
mqy Jun 18, 2023
67bb367
typos
mqy Jun 18, 2023
2193ab6
fix cuda build error
mqy Jun 18, 2023
0ec4dab
fixed break and assertion from select; try to fix CUDA link error
mqy Jun 18, 2023
5abb8ae
fix warning
mqy Jun 18, 2023
5feefb3
threading: add suspend/resume APIs, so it's possible to run a thread …
mqy Jun 18, 2023
286c5b3
threading: remove unnecessary spin lock/unlock from suspend/resume; ad…
mqy Jun 18, 2023
9872863
threading test: less loops to avoid timeout
mqy Jun 18, 2023
6609c22
fixed OP_OUT_PROD and OP_NONE
mqy Jun 18, 2023
65fd65e
tune: update readme
mqy Jun 19, 2023
44b831d
tune: extract ggml_mulmat_tune_bench_wrapper
mqy Jun 19, 2023
4d32b40
threading test: decrease a threshold value to avoid timeout
mqy Jun 19, 2023
cc8a375
threading: fix deadlock by reverting part of changes from commit 286c…
mqy Jun 19, 2023
aac7f7c
threading: try to fix a deadlock, also added critical deadlock detection
mqy Jun 19, 2023
08972d2
threading: removed feature wait_on_done to figure out causes of deadl…
mqy Jun 19, 2023
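Several of the commits above revolve around the PR's headline threading change: instead of pure spinning, workers spin briefly for new work and then fall back to a wait/notify (condition-variable) sleep, with suspend/resume layered on top. Below is a minimal sketch of that spin-then-wait pattern, using hypothetical names; the PR's actual implementation lives in ggml-threading.c and differs in detail.

#include <pthread.h>
#include <stdatomic.h>

// Illustrative only -- not the PR's API.
struct worker_state {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    atomic_int      has_work; // set by the main thread when a task is ready
    atomic_int      stop;     // set once at shutdown
};

enum { SPIN_ROUNDS = 1000 }; // spin briefly before sleeping on the condvar

static void worker_wait_for_work(struct worker_state *s) {
    // Fast path: spin for a short while, hoping work arrives soon.
    for (int i = 0; i < SPIN_ROUNDS; i++) {
        if (atomic_load(&s->has_work) || atomic_load(&s->stop)) {
            return;
        }
    }
    // Slow path: block on the condition variable until notified.
    pthread_mutex_lock(&s->mutex);
    while (!atomic_load(&s->has_work) && !atomic_load(&s->stop)) {
        pthread_cond_wait(&s->cond, &s->mutex);
    }
    pthread_mutex_unlock(&s->mutex);
}

static void submit_work(struct worker_state *s) {
    pthread_mutex_lock(&s->mutex);
    atomic_store(&s->has_work, 1);
    pthread_cond_broadcast(&s->cond); // wake any sleeping workers
    pthread_mutex_unlock(&s->mutex);
}

The spin keeps latency low when tasks arrive back-to-back (the common case inside a graph compute), while the condvar wait avoids burning CPU on idle workers, which is also what makes suspend/resume feasible.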
1 change: 1 addition & 0 deletions .gitignore
@@ -40,6 +40,7 @@ models/*
/server
/Pipfile
/libllama.so
/mulmat-tune

build-info.h
arm_neon.h
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_TUNE "llama: mulmat tune" ON)

#
# Build info header
@@ -214,6 +215,7 @@ if (LLAMA_BLAS)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

@@ -276,6 +278,11 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_TUNE)
add_compile_definitions(GGML_USE_TUNE)
add_compile_definitions(GGML_TUNE_NDEBUG)
endif()

if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
add_compile_definitions(GGML_USE_K_QUANTS)
@@ -450,6 +457,8 @@ endif()

add_library(ggml OBJECT
ggml.c
ggml-threading.c
ggml-tune.c
ggml.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
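With these CMake changes, tuning support is compiled in by default (LLAMA_TUNE is ON) and can be switched off at configure time, e.g.:

cmake .. -DLLAMA_TUNE=OFF

which leaves GGML_USE_TUNE undefined so the tune code paths compile out.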
33 changes: 27 additions & 6 deletions Makefile
@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune

ifdef LLAMA_BUILD_SERVER
BUILD_TARGETS += server
@@ -47,7 +47,8 @@ endif
OPT = -O3
CFLAGS = -I. $(OPT) -std=c11 -fPIC
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
LDFLAGS =
# -lm fixes "ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5'" on Ubuntu 22.04
LDFLAGS = -lm

ifdef LLAMA_DEBUG
CFLAGS += -O0 -g
@@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as measured at predict time).
# Mac Intel & M1 - include Accelerate framework.
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
@@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
ifeq ($(UNAME_S),Darwin)
# OpenBLAS installed with Homebrew on macOS.
CFLAGS += -I/usr/local/opt/openblas/include
LDFLAGS += -L/usr/local/opt/openblas/lib
endif
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\""
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS

@@ -225,11 +231,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

ifdef LLAMA_NO_K_QUANTS
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS

ifndef LLAMA_NO_TUNE
CFLAGS += -DGGML_USE_TUNE #-DGGML_TUNE_NDEBUG
CXXFLAGS += -DGGML_USE_TUNE
endif

#
# Print build information
#
@@ -245,6 +256,8 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

OBJS += ggml-tune.o ggml-threading.o

#
# Build library
#
@@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@

llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ggml-threading.o: ggml-threading.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@

ggml-tune.o: ggml-tune.c ggml.h
$(CC) $(CFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS)

build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
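For Makefile builds the equivalent knob is inverted: tuning is on unless LLAMA_NO_TUNE is defined, e.g.

make LLAMA_NO_TUNE=1

while a plain `make` now also builds the new mulmat-tune benchmarking tool, since it was added to BUILD_TARGETS.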
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -39,6 +39,7 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
add_subdirectory(mulmat-tune)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
22 changes: 22 additions & 0 deletions examples/common.cpp
@@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.mem_test = true;
} else if (arg == "--export") {
params.export_cgraph = true;
#ifdef GGML_USE_TUNE
} else if (arg == "--tune") {
params.tune = true;
} else if (arg == "--tune-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.tune_file = argv[i];
#endif // GGML_USE_TUNE
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -425,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
process_escapes(params.prompt);
}

#ifdef GGML_USE_TUNE
if (params.n_gpu_layers > 0) {
if (params.tune || !params.tune_file.empty()) {
fprintf(stderr, "[tune] error: tunning and GPU offloading cannot be used at the same time, abort.\n");
exit(1);
}
}
#endif
return true;
}

@@ -498,6 +516,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
#ifdef GGML_USE_TUNE
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
#endif
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
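Taken together, the new flags support two workflows (hypothetical invocations; model path and file name are placeholders):

# benchmark mulmat shapes and write the tune data, then exit
./main -m models/7B/ggml-model-q4_0.bin --tune --tune-file tune.txt

# later: load the tune data and run as usual
./main -m models/7B/ggml-model-q4_0.bin --tune-file tune.txt -p "Hello"

Note the guard added to gpt_params_parse: either flag combined with GPU offloading (n_gpu_layers > 0) aborts, since tuning and offloading cannot be used together in this PR.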
2 changes: 2 additions & 0 deletions examples/common.h
@@ -77,6 +77,8 @@ struct gpt_params {
bool mem_test = false; // compute maximum memory usage
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
bool tune = false; // mulmat tune: enable
std::string tune_file = ""; // mulmat tune: data file
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
10 changes: 10 additions & 0 deletions examples/main/main.cpp
@@ -117,6 +117,16 @@ int main(int argc, char ** argv) {
return 1;
}

#ifdef GGML_USE_TUNE
if (params.tune || !params.tune_file.empty()) {
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
if (!ok || (params.tune && !params.tune_file.empty())) {
llama_free(ctx);
return ok ? 0 : 1;
}
}
#endif

// print system information
{
fprintf(stderr, "\n");
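This block is the whole integration point in main: llama_mulmat_tune either runs the benchmark (params.tune) or loads an existing tune file, and main exits early only when the call fails or when a bench-to-file run was requested (--tune with a non-empty tune_file), in which case a successful bench returns 0.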
14 changes: 14 additions & 0 deletions examples/mulmat-tune/CMakeLists.txt
@@ -0,0 +1,14 @@
set(TARGET mulmat-tune)
add_executable(${TARGET} mulmat-tune.cpp)

if (XCODE OR MSVC)
set(MULMAT_TUNE_LIBS ggml)
else()
set(MULMAT_TUNE_LIBS ggml m)
endif()

target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()