Skip to content

Commit

Permalink
report vulkan cm 8x8x16 config, enable fp16a cm (#5298)
Browse files: browse the repository at this point in the history
  • Loading branch information
nihui authored Jan 19, 2024
1 parent 1012f85 commit 05b4dcb
Show file tree
Hide file tree
Showing 10 changed files with 223 additions and 39 deletions.
27 changes: 24 additions & 3 deletions src/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class GpuInfoPrivate

// cooperative matrix
bool support_cooperative_matrix;
bool support_cooperative_matrix_8_8_16;
bool support_cooperative_matrix_16_8_8;
bool support_cooperative_matrix_16_8_16;
bool support_cooperative_matrix_16_16_16;
Expand Down Expand Up @@ -646,6 +647,11 @@ bool GpuInfo::support_cooperative_matrix() const
return d->support_cooperative_matrix;
}

// Returns the 8x8x16 cooperative matrix capability flag held in the private
// GpuInfo data. During gpu instance creation the flag is reset to false and
// set to true when the device lists an M=8, N=8, K=16 configuration with
// fp16 A/B operands, fp32 accumulator/result and subgroup scope.
bool GpuInfo::support_cooperative_matrix_8_8_16() const
{
    const bool has_cm_8_8_16 = d->support_cooperative_matrix_8_8_16;
    return has_cm_8_8_16;
}

bool GpuInfo::support_cooperative_matrix_16_8_8() const
{
return d->support_cooperative_matrix_16_8_8;
Expand Down Expand Up @@ -1783,6 +1789,7 @@ int create_gpu_instance(const char* driver_path)
gpu_info.support_int8_arithmetic = false;
gpu_info.support_ycbcr_conversion = false;
gpu_info.support_cooperative_matrix = false;
gpu_info.support_cooperative_matrix_8_8_16 = false;
gpu_info.support_cooperative_matrix_16_8_8 = false;
gpu_info.support_cooperative_matrix_16_8_16 = false;
gpu_info.support_cooperative_matrix_16_16_16 = false;
Expand Down Expand Up @@ -1938,6 +1945,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
gpu_info.support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
Expand Down Expand Up @@ -1987,6 +2001,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
gpu_info.support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
Expand Down Expand Up @@ -2028,9 +2049,9 @@ int create_gpu_instance(const char* driver_path)
gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);

NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16,
gpu_info.support_cooperative_matrix_16_16_16);
NCNN_LOGE("[%u %s] fp16-8x8x16/16x8x8/16x8x16/16x16x16=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_cooperative_matrix_8_8_16, gpu_info.support_cooperative_matrix_16_8_8,
gpu_info.support_cooperative_matrix_16_8_16, gpu_info.support_cooperative_matrix_16_16_16);

gpu_info_index++;
}
Expand Down
1 change: 1 addition & 0 deletions src/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ class NCNN_EXPORT GpuInfo

// cooperative matrix feature
bool support_cooperative_matrix() const;
bool support_cooperative_matrix_8_8_16() const;
bool support_cooperative_matrix_16_8_8() const;
bool support_cooperative_matrix_16_8_16() const;
bool support_cooperative_matrix_16_16_16() const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15

coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;

if (bias_term == 1)
{
Expand All @@ -93,17 +93,24 @@ void main()
coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
#endif
}
else
{
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
}

const int N = psc(c) / 4;
Expand Down Expand Up @@ -201,6 +208,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
Expand All @@ -210,6 +223,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
#endif

barrier();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7

coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;

if (bias_term == 1)
{
Expand All @@ -103,6 +103,16 @@ void main()
coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
sum4 = bias2;
sum5 = bias2;
sum6 = bias3;
sum7 = bias3;
#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
Expand All @@ -111,17 +121,18 @@ void main()
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
#endif
}
else
{
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
}

const int N = psc(c) / 2;
Expand Down Expand Up @@ -247,6 +258,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
Expand All @@ -264,6 +285,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
#endif

barrier();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15

#if NCNN_fp16_arithmetic
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
#endif

if (bias_term == 1)
{
Expand All @@ -93,17 +100,31 @@ void main()
coopMatLoadNV(bias0, bias_data, gy, 0, false);
coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
#endif
}
else
{
#if NCNN_fp16_arithmetic
sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
#endif
}

const int N = psc(c) / 4;
Expand Down Expand Up @@ -201,6 +222,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
Expand All @@ -210,6 +237,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
#endif

barrier();

Expand Down
Loading

0 comments on commit 05b4dcb

Please sign in to comment.