From 2df8d0353649df118cb73698b91101951a378383 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 3 Jul 2024 23:02:58 +0000
Subject: [PATCH] Define and optimize  RDNA1 (#8085)

---
 ggml-cuda/common.cuh |  4 ++++
 ggml-cuda/mmq.cuh    | 10 +++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 472f4ace1c2ad2..4ff06b8719d378 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -227,6 +227,10 @@ typedef float2 dfloat2;
 #define RDNA2
 #endif
 
+#if defined(__gfx1010__) || defined(__gfx1012__)
+#define RDNA1
+#endif
+
 #ifndef __has_builtin
     #define __has_builtin(x) 0
 #endif
diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh
index 94d6c2c7e08d62..ffb3b4bd572a16 100644
--- a/ggml-cuda/mmq.cuh
+++ b/ggml-cuda/mmq.cuh
@@ -61,12 +61,16 @@ static constexpr __device__ int get_mmq_x_max_device() {
 }
 
 static constexpr int get_mmq_y_host(const int cc) {
-    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
+    return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA1)
+    return 64;
+#else
     return 128;
+#endif // defined RDNA1
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     return 128;
@@ -2400,9 +2404,9 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)