CUDA: deduplicate mmq code #7397
Conversation
ggml-cuda/mmq.cu (outdated diff)
typedef struct mmq_arch_config_t {
    int x;
    int y;
    int nwarps;
} mmq_arch_config_t;
The typedef struct stuff is not necessary in C++, just do struct mmq_arch_config_t { .. };
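Applied to the struct above, the suggestion would look like this (a minimal sketch of the reviewer's point, not the exact code that landed):

struct mmq_arch_config_t {
    int x;
    int y;
    int nwarps;
};

In C++ (unlike C), the struct name is directly usable as a type, so the typedef adds nothing.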
ggml-cuda/mmq.cu (outdated diff)
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_config_t mmq_config = MMQ_CONFIG_Q4_0;

 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  = MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  = MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
+    constexpr mmq_arch_config_t arch_config = mmq_config.rdna2;
 #else
-    const int mmq_x  = MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  = MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
+    constexpr mmq_arch_config_t arch_config = mmq_config.rdna1;
 #endif // defined(RDNA3) || defined(RDNA2)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
+    constexpr mmq_arch_config_t arch_config = mmq_config.ampere;
 #else
+    constexpr mmq_arch_config_t arch_config = mmq_config.pascal;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
You can use a constexpr function to deduplicate this code, e.g.:
constexpr static __device__ mmq_arch_config_t get_mmq_arch_config(const mmq_config_t mmq_config) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    return mmq_config.rdna2;
#else
    return mmq_config.rdna1;
#endif // defined(RDNA3) || defined(RDNA2)
#else
#if __CUDA_ARCH__ >= CC_VOLTA
    return mmq_config.ampere;
#else
    return mmq_config.pascal;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q4_0(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    constexpr mmq_arch_config_t arch_config = get_mmq_arch_config(MMQ_CONFIG_Q4_0);
    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
        load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
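The property this suggestion relies on is that a constexpr function call is itself a constant expression, so the members of the returned struct remain usable as non-type template arguments. A distilled, self-contained illustration of the pattern (all names below are illustrative, not taken from mmq.cu):

struct arch_config_t {
    int x;
    int y;
    int nwarps;
};

// Select a configuration at compile time; only one return statement
// survives preprocessing, so this is valid even as a C++11 constexpr.
constexpr arch_config_t get_arch_config() {
#if defined(SOME_ARCH)   // hypothetical architecture macro
    return {64, 128, 8};
#else
    return {4, 32, 4};
#endif
}

template <int tile_y>
void allocate_tiles() { /* ... */ }

int main() {
    constexpr arch_config_t cfg = get_arch_config();
    allocate_tiles<cfg.y>();  // cfg.y is a compile-time constant, so it is a valid template argument
    return 0;
}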
It also seems possible to use a constexpr function with __launch_bounds__:

constexpr static __device__ int get_config_launch_bounds(const mmq_config config) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
    return WARP_SIZE*config.nwarps;
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    return 0;
}

template <bool need_check> static __global__ void
__launch_bounds__(get_config_launch_bounds(MMQ_CONFIG_Q4_0), 2)
mul_mat_q4_0(...

C++11 constexpr functions only allow a single return statement, though, so the function above would need reworking before it compiles as C++11.
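For reference, one way to satisfy that restriction is to restructure the preprocessor conditionals so exactly one return statement survives on every path. A hedged sketch of that idea, not code from the PR:

constexpr static __device__ int get_config_launch_bounds(const mmq_config config) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA3) || defined(RDNA2))
    // AMD RDNA1/2/3: bound the kernel by the configured number of warps.
    return WARP_SIZE*config.nwarps;
#else
    // Fallback used on the other paths; whether __launch_bounds__(0, ...)
    // is accepted as "no bound" would need to be verified per compiler.
    return 0;
#endif
}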
Force-pushed from 70042db to d9a738c (compare).
I tried wrangling the code into producing the correct tile sizes for each arch without having to compile all tile sizes for all archs, but after some time I gave up; I don't know how to correctly pass the templates for e.g. allocate_tiles. For now I would like to just merge this PR as-is. I will soon look into how to utilize tensor cores with PTX, at which point I will likely overhaul the MMQ kernels to be more like #4801. At that point I plan to replace the current system for determining tile sizes anyways.
Note: CI is failing due to ggml-org / ggml-4-x86-cuda-v100 - failure 2 in 0:15.28, but this was confirmed fixed a few commits later by "CUDA: fix unused warning in mmq.cu" (https://github.com/ggerganov/llama.cpp/pull/7442, commit https://github.com/ggerganov/llama.cpp/commit/fcf6538ba6702c55eaec70da9a75c81d04900a72).
This PR deduplicates the CUDA code related to mul_mat_q, mostly the code around launching the kernels. There is still duplication around the __global__ functions because I don't know how to handle __launch_bounds__ in a way that isn't an overly complicated and difficult-to-understand macro.