Tencent · nihui · Jan 19, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/src/gpu.cpp b/src/gpu.cpp
@@ -333,6 +333,7 @@ class GpuInfoPrivate
 
     // cooperative matrix
     bool support_cooperative_matrix;
+    bool support_cooperative_matrix_8_8_16;
     bool support_cooperative_matrix_16_8_8;
     bool support_cooperative_matrix_16_8_16;
     bool support_cooperative_matrix_16_16_16;
@@ -646,6 +647,11 @@ bool GpuInfo::support_cooperative_matrix() const
     return d->support_cooperative_matrix;
 }
 
+bool GpuInfo::support_cooperative_matrix_8_8_16() const // accessor for the 8x8x16 (MxNxK) cooperative matrix capability flag
+{
+    return d->support_cooperative_matrix_8_8_16; // NOTE(review): presumably the flag the device-enumeration hunks set for fp16 A/B, fp32 C/result, subgroup scope - confirm gpu_info there refers to this private data
+}
+
 bool GpuInfo::support_cooperative_matrix_16_8_8() const
 {
     return d->support_cooperative_matrix_16_8_8;
@@ -1783,6 +1789,7 @@ int create_gpu_instance(const char* driver_path)
         gpu_info.support_int8_arithmetic = false;
         gpu_info.support_ycbcr_conversion = false;
         gpu_info.support_cooperative_matrix = false;
+        gpu_info.support_cooperative_matrix_8_8_16 = false;
         gpu_info.support_cooperative_matrix_16_8_8 = false;
         gpu_info.support_cooperative_matrix_16_8_16 = false;
         gpu_info.support_cooperative_matrix_16_16_16 = false;
@@ -1938,6 +1945,13 @@ int create_gpu_instance(const char* driver_path)
                     const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
                     // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);
 
+                    if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+                            && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+                            && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+                            && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+                    {
+                        gpu_info.support_cooperative_matrix_8_8_16 = true;
+                    }
                     if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                             && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
@@ -1987,6 +2001,13 @@ int create_gpu_instance(const char* driver_path)
                     const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
                     // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);
 
+                    if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+                            && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+                            && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+                            && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+                    {
+                        gpu_info.support_cooperative_matrix_8_8_16 = true;
+                    }
                     if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                             && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                             && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
@@ -2028,9 +2049,9 @@ int create_gpu_instance(const char* driver_path)
                   gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
                   gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);
 
-        NCNN_LOGE("[%u %s]  fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName,
-                  gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16,
-                  gpu_info.support_cooperative_matrix_16_16_16);
+        NCNN_LOGE("[%u %s]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
+                  gpu_info.support_cooperative_matrix_8_8_16, gpu_info.support_cooperative_matrix_16_8_8,
+                  gpu_info.support_cooperative_matrix_16_8_16, gpu_info.support_cooperative_matrix_16_16_16);
 
         gpu_info_index++;
     }

diff --git a/src/gpu.h b/src/gpu.h
@@ -272,6 +272,7 @@ class NCNN_EXPORT GpuInfo
 
     // cooperative matrix feature
     bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_8_8_16() const;
     bool support_cooperative_matrix_16_8_8() const;
     bool support_cooperative_matrix_16_8_16() const;
     bool support_cooperative_matrix_16_16_16() const;

diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
@@ -80,10 +80,10 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
-    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
-    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
-    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
-    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
+    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
+    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
+    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
+    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
 
     if (bias_term == 1)
     {
@@ -93,17 +93,24 @@ void main()
         coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
 
+#if NCNN_fp16_arithmetic
+        sum0 = bias0;
+        sum1 = bias0;
+        sum2 = bias1;
+        sum3 = bias1;
+#else
         sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
         sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
         sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
         sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
+#endif
     }
     else
     {
-        sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
-        sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
-        sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
-        sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+        sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+        sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+        sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+        sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     }
 
     const int N = psc(c) / 4;
@@ -201,6 +208,12 @@ void main()
     if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 
+#if NCNN_fp16_arithmetic
+    coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#else
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -210,6 +223,7 @@ void main()
     coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
 
     barrier();
 

diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
@@ -82,14 +82,14 @@ void main()
     const int lxd8 = lx / 8; // 0 1 2 3
     const int lxm8 = lx % 8; // 0 1 2 3 .... 7
 
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
-    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
+    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
 
     if (bias_term == 1)
     {
@@ -103,6 +103,16 @@ void main()
         coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);
 
+#if NCNN_fp16_arithmetic
+        sum0 = bias0;
+        sum1 = bias0;
+        sum2 = bias1;
+        sum3 = bias1;
+        sum4 = bias2;
+        sum5 = bias2;
+        sum6 = bias3;
+        sum7 = bias3;
+#else
         sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
         sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
         sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
@@ -111,17 +121,18 @@ void main()
         sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
         sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
         sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
+#endif
     }
     else
     {
-        sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
-        sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+        sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     }
 
     const int N = psc(c) / 2;
@@ -247,6 +258,16 @@ void main()
     if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 
+#if NCNN_fp16_arithmetic
+    coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+    coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#else
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -264,6 +285,7 @@ void main()
     coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
 
     barrier();
 

diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
@@ -80,10 +80,17 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
+#if NCNN_fp16_arithmetic
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
+#else
     fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
     fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
     fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
     fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
+#endif
 
     if (bias_term == 1)
     {
@@ -93,17 +100,31 @@ void main()
         coopMatLoadNV(bias0, bias_data, gy, 0, false);
         coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
 
+#if NCNN_fp16_arithmetic
+        sum0 = bias0;
+        sum1 = bias0;
+        sum2 = bias1;
+        sum3 = bias1;
+#else
         sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
         sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
         sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
         sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+#endif
     }
     else
     {
+#if NCNN_fp16_arithmetic
+        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
         sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
         sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
         sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
         sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
     }
 
     const int N = psc(c) / 4;
@@ -201,6 +222,12 @@ void main()
     if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 
+#if NCNN_fp16_arithmetic
+    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
     fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
     fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
     fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
@@ -210,6 +237,7 @@ void main()
     coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
     coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
     coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
 
     barrier();