Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

report vulkan cm 8x8x16 config, enable fp16a cm #5298

Merged
merged 3 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions src/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class GpuInfoPrivate

// cooperative matrix
bool support_cooperative_matrix;
bool support_cooperative_matrix_8_8_16;
bool support_cooperative_matrix_16_8_8;
bool support_cooperative_matrix_16_8_16;
bool support_cooperative_matrix_16_16_16;
Expand Down Expand Up @@ -646,6 +647,11 @@ bool GpuInfo::support_cooperative_matrix() const
return d->support_cooperative_matrix;
}

// Returns the cached support_cooperative_matrix_8_8_16 flag from the private
// GpuInfo data. create_gpu_instance() resets this flag to false and sets it to
// true when a cooperative-matrix property entry has MSize 8, NSize 8, KSize 16,
// float16 A/B types, float32 C/result types and subgroup scope (checked for
// both the KHR and the NV property structs).
// NOTE(review): the check requires a float32 accumulator; it does not by itself
// confirm that a float16-accumulator 8x8x16 configuration exists.
bool GpuInfo::support_cooperative_matrix_8_8_16() const
{
return d->support_cooperative_matrix_8_8_16;
}

bool GpuInfo::support_cooperative_matrix_16_8_8() const
{
return d->support_cooperative_matrix_16_8_8;
Expand Down Expand Up @@ -1783,6 +1789,7 @@ int create_gpu_instance(const char* driver_path)
gpu_info.support_int8_arithmetic = false;
gpu_info.support_ycbcr_conversion = false;
gpu_info.support_cooperative_matrix = false;
gpu_info.support_cooperative_matrix_8_8_16 = false;
gpu_info.support_cooperative_matrix_16_8_8 = false;
gpu_info.support_cooperative_matrix_16_8_16 = false;
gpu_info.support_cooperative_matrix_16_16_16 = false;
Expand Down Expand Up @@ -1938,6 +1945,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
gpu_info.support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
Expand Down Expand Up @@ -1987,6 +2001,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
gpu_info.support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
Expand Down Expand Up @@ -2028,9 +2049,9 @@ int create_gpu_instance(const char* driver_path)
gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);

NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16,
gpu_info.support_cooperative_matrix_16_16_16);
NCNN_LOGE("[%u %s] fp16-8x8x16/16x8x8/16x8x16/16x16x16=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.support_cooperative_matrix_8_8_16, gpu_info.support_cooperative_matrix_16_8_8,
gpu_info.support_cooperative_matrix_16_8_16, gpu_info.support_cooperative_matrix_16_16_16);

gpu_info_index++;
}
Expand Down
1 change: 1 addition & 0 deletions src/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ class NCNN_EXPORT GpuInfo

// cooperative matrix feature
bool support_cooperative_matrix() const;
bool support_cooperative_matrix_8_8_16() const;
bool support_cooperative_matrix_16_8_8() const;
bool support_cooperative_matrix_16_8_16() const;
bool support_cooperative_matrix_16_16_16() const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15

coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;

if (bias_term == 1)
{
Expand All @@ -93,17 +93,24 @@ void main()
coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
#endif
}
else
{
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
}

const int N = psc(c) / 4;
Expand Down Expand Up @@ -201,6 +208,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
Expand All @@ -210,6 +223,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
#endif

barrier();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7

coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;

if (bias_term == 1)
{
Expand All @@ -103,6 +103,16 @@ void main()
coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
sum4 = bias2;
sum5 = bias2;
sum6 = bias3;
sum7 = bias3;
#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
Expand All @@ -111,17 +121,18 @@ void main()
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
#endif
}
else
{
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
}

const int N = psc(c) / 2;
Expand Down Expand Up @@ -247,6 +258,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
Expand All @@ -264,6 +285,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
#endif

barrier();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15

#if NCNN_fp16_arithmetic
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
#endif

if (bias_term == 1)
{
Expand All @@ -93,17 +100,31 @@ void main()
coopMatLoadNV(bias0, bias_data, gy, 0, false);
coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);

#if NCNN_fp16_arithmetic
sum0 = bias0;
sum1 = bias0;
sum2 = bias1;
sum3 = bias1;
#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
#endif
}
else
{
#if NCNN_fp16_arithmetic
sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
#endif
}

const int N = psc(c) / 4;
Expand Down Expand Up @@ -201,6 +222,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;

#if NCNN_fp16_arithmetic
coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
Expand All @@ -210,6 +237,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
#endif

barrier();

Expand Down
Loading
Loading