diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu index 2dff630229..7653f8c445 100644 --- a/llama.cpp/ggml-cuda.cu +++ b/llama.cpp/ggml-cuda.cu @@ -10884,8 +10884,8 @@ static ggml_cuda_device_info ggml_cuda_init() { // Workaround for a rocBLAS bug when using multiple graphics cards: // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 #ifndef GGML_USE_TINYBLAS - rocblas_initialize(); - CUDA_CHECK(cudaDeviceSynchronize()); + // rocblas_initialize(); // already called + // CUDA_CHECK(cudaDeviceSynchronize()); #endif #endif @@ -13507,7 +13507,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t GGML_ASSERT(stat == cudaSuccess); } // Launch graph + printf("cudaGraphLaunch begin\n"); CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); + printf("cudaGraphLaunch done\n"); #else graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH diff --git a/llamafile/cuda.c b/llamafile/cuda.c index 4c22b4d31c..0834795e4c 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -559,7 +559,14 @@ static bool compile_amd_windows(const char *clangxx, const char *dso, const char (char *)offload_arch, "-Wno-ignored-attributes", "-D_CRT_SECURE_NO_WARNINGS", - COMMON_FLAGS, + "-DGGML_BUILD=1", + "-DGGML_SHARED=1", + "-DGGML_MULTIPLATFORM", + "-DGGML_CUDA_DMMV_X=32", + "-DK_QUANTS_PER_ITERATION=2", + "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128", + "-DGGML_CUDA_MMV_Y=1", + "-DGGML_USE_TINYBLAS", "-o", (char *)tmpdso, (char *)src, @@ -571,10 +578,10 @@ static bool compile_amd_windows(const char *clangxx, const char *dso, const char "-amdgpu-early-inline-all=true", "-isystem", gc(xasprintf("%s/include", hip_path)), - BLAS_ONLY("-l"), - BLAS_ONLY(gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), - BLAS_ONLY("-l"), - BLAS_ONLY(gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), + /* BLAS_ONLY("-l"), */ + /* BLAS_ONLY(gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), */ + /* BLAS_ONLY("-l"), */ + /* BLAS_ONLY(gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), */ "-l", gc(xasprintf("%s/lib/amdhip64.%s", hip_path, lib)), "-lkernel32",