From 60404a831edd135565f8a152ef180baac41ba4d4 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 22 Jun 2024 08:58:42 -0700 Subject: [PATCH] Always use tinyBLAS with AMD GPUs on Windows When llamafile uses hipBLAS with ROCm SDK 5.7.1 on Windows 10 the process crashes shortly after tokens start getting printed. This is possibly the worst heisenbug I've ever seen in my career. It seems to crash in AMD code, in a separate thread, inside hipGraphicsUnregisterResource, when a vqmovdqu instruction is being executed. While this happens, cosmo's main thread is usually doing something like std::string and std::locale stuff which appears unrelated. Could possibly be related to C++ exceptions and thread-local storage. Using --tinyblas appears to make it go away, but I can't say for certain it has anything to do with hipBLAS, since it might simply not manifest itself, because the binary footprint, stack, or heap memory layout changed. Let's keep our fingers crossed that tinyBLAS will save us from this issue. Note also that no one else has reported the bug even though it's been impacting me for months. 
--- llama.cpp/ggml-cuda.cu | 6 ++++-- llamafile/cuda.c | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu index 2dff630229..7653f8c445 100644 --- a/llama.cpp/ggml-cuda.cu +++ b/llama.cpp/ggml-cuda.cu @@ -10884,8 +10884,8 @@ static ggml_cuda_device_info ggml_cuda_init() { // Workaround for a rocBLAS bug when using multiple graphics cards: // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 #ifndef GGML_USE_TINYBLAS - rocblas_initialize(); - CUDA_CHECK(cudaDeviceSynchronize()); + // rocblas_initialize(); // already called + // CUDA_CHECK(cudaDeviceSynchronize()); #endif #endif @@ -13507,7 +13507,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t GGML_ASSERT(stat == cudaSuccess); } // Launch graph + printf("cudaGraphLaunch begin\n"); CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); + printf("cudaGraphLaunch done\n"); #else graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH diff --git a/llamafile/cuda.c b/llamafile/cuda.c index 4c22b4d31c..0834795e4c 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -559,7 +559,14 @@ static bool compile_amd_windows(const char *clangxx, const char *dso, const char (char *)offload_arch, "-Wno-ignored-attributes", "-D_CRT_SECURE_NO_WARNINGS", - COMMON_FLAGS, + "-DGGML_BUILD=1", + "-DGGML_SHARED=1", + "-DGGML_MULTIPLATFORM", + "-DGGML_CUDA_DMMV_X=32", + "-DK_QUANTS_PER_ITERATION=2", + "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128", + "-DGGML_CUDA_MMV_Y=1", + "-DGGML_USE_TINYBLAS", "-o", (char *)tmpdso, (char *)src, @@ -571,10 +578,10 @@ static bool compile_amd_windows(const char *clangxx, const char *dso, const char "-amdgpu-early-inline-all=true", "-isystem", gc(xasprintf("%s/include", hip_path)), - BLAS_ONLY("-l"), - BLAS_ONLY(gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), - BLAS_ONLY("-l"), - BLAS_ONLY(gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), + /* 
BLAS_ONLY("-l"), */ + /* BLAS_ONLY(gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), */ + /* BLAS_ONLY("-l"), */ + /* BLAS_ONLY(gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), */ "-l", gc(xasprintf("%s/lib/amdhip64.%s", hip_path, lib)), "-lkernel32",