From 74c85528d4362a3034f32148a2f167efbf54ab3f Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sun, 23 Oct 2022 21:18:02 +0800
Subject: [PATCH 01/11] get_physical_cpu_count api family

---
 src/cpu.cpp | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/cpu.h   |   4 ++
 2 files changed, 114 insertions(+)
diff --git a/src/cpu.cpp b/src/cpu.cpp
index c84c6491157..ba3bf94dd4c 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -42,6 +42,11 @@
 #include <emscripten/threading.h>
 #endif
 
+#if defined _WIN32 && !(defined __MINGW32__)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
 #if defined __ANDROID__ || defined __linux__
 #if defined __ANDROID__
 #if __ANDROID_API__ >= 18
@@ -1164,6 +1169,111 @@ int get_big_cpu_count()
     return big_cpu_count ? big_cpu_count : g_cpucount;
 }
 
+#if defined __ANDROID__ || defined __linux__
+static int get_thread_siblings(int cpuid)
+{
+    // Parses the first hex number in cpu<cpuid>'s topology/thread_siblings file; -1 means unavailable.
+    char path[256];
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid);
+
+    FILE* fp = fopen(path, "rb");
+    if (!fp)
+        return -1;
+
+    // %x stores through an unsigned int pointer, so scan into one and convert afterwards
+    unsigned int siblings_mask = 0;
+    const int nscan = fscanf(fp, "%x", &siblings_mask);
+    fclose(fp);
+
+    // a failed parse is reported the same way as a missing file
+    if (nscan != 1)
+        return -1;
+    return (int)siblings_mask;
+}
+#endif // defined __ANDROID__ || defined __linux__
+// Counts physical cores through the platform topology query; any result outside 1..g_cpucount falls back to g_cpucount.
+static int get_physical_cpucount()
+{
+    int count = 0;
+#if (defined _WIN32 && !(defined __MINGW32__))
+    typedef BOOL (WINAPI *LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI) GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi == NULL)
+    {
+        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
+        return g_cpucount;
+    }
+
+    DWORD return_length = 0;
+    glpi(NULL, &return_length); // sizing call: only return_length is wanted here
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+    if (buffer == NULL || !glpi(buffer, &return_length))
+    {
+        // never walk a buffer that was not allocated or not filled in
+        free(buffer);
+        return g_cpucount;
+    }
+
+    // every RelationProcessorCore record is counted as one physical core
+    const DWORD record_count = (DWORD)(return_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
+    for (DWORD k = 0; k < record_count; k++)
+    {
+        if (buffer[k].Relationship == RelationProcessorCore)
+            count++;
+    }
+
+    free(buffer);
+#elif defined __ANDROID__ || defined __linux__
+    std::vector<int> thread_set;
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        int thread_siblings = get_thread_siblings(i);
+        if (thread_siblings == -1)
+        {
+            // ignore malformed one
+            continue;
+        }
+
+        for (size_t j = 0; j < thread_set.size(); j++)
+        {
+            if (thread_set[j] == thread_siblings)
+                continue;
+
+            thread_set.push_back(thread_siblings);
+            count++;
+        }
+    }
+#elif __APPLE__
+    size_t len = sizeof(count);
+    sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); // return value unchecked; the range check below covers a count left at 0
+#else
+    count = g_cpucount;
+#endif
+
+    if (count <= 0 || count > g_cpucount)
+        count = g_cpucount; // a detection miss (0) or an overcount both fall back to the logical count
+
+    return count;
+}
+
+static int g_physical_cpucount = get_physical_cpucount();
+
+int get_physical_cpu_count()
+{
+    return g_physical_cpucount; // cached result of get_physical_cpucount(), set by the static initializer
+}
+
+int get_physical_little_cpu_count()
+{
+    return g_physical_cpucount * 2 - g_cpucount;
+}
+
+int get_physical_big_cpu_count()
+{
+    return g_cpucount - g_physical_cpucount;
+}
+
 #if defined __ANDROID__ || defined __linux__
 static int get_max_freq_khz(int cpuid)
 {
diff --git a/src/cpu.h b/src/cpu.h
index 5a94106ef47..cdc8b229b72 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -110,6 +110,10 @@ NCNN_EXPORT int get_cpu_count();
 NCNN_EXPORT int get_little_cpu_count();
 NCNN_EXPORT int get_big_cpu_count();
 
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
 // bind all threads on little clusters if powersave enabled
 // affects HMP arch cpu like ARM big.LITTLE
 // only implemented on android at the moment

From a73032e7f86f5c1c63ff0b4aa56d398984946bb1 Mon Sep 17 00:00:00 2001
From: nihui <nihui@users.noreply.github.com>
Date: Sun, 23 Oct 2022 13:19:29 +0000
Subject: [PATCH 02/11] apply code-format changes

---
 src/cpu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index ba3bf94dd4c..7971bd3a3d3 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1196,8 +1196,8 @@ static int get_physical_cpucount()
 {
     int count = 0;
 #if (defined _WIN32 && !(defined __MINGW32__))
-    typedef BOOL (WINAPI *LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
-    LPFN_GLPI glpi = (LPFN_GLPI) GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
     if (glpi == NULL)
     {
         NCNN_LOGE("GetLogicalProcessorInformation is not supported");

From 32a65758c1704eda53db8563ace35ab7e155a079 Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Mon, 24 Oct 2022 11:09:13 +0800
Subject: [PATCH 03/11] fix

---
 src/cpu.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 7971bd3a3d3..6c3fc3c070d 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1235,11 +1235,18 @@ static int get_physical_cpucount()
             continue;
         }
 
+        bool thread_siblings_exists = false;
         for (size_t j = 0; j < thread_set.size(); j++)
         {
             if (thread_set[j] == thread_siblings)
-                continue;
+            {
+                thread_siblings_exists = true;
+                break;
+            }
+        }
 
+        if (!thread_siblings_exists)
+        {
             thread_set.push_back(thread_siblings);
             count++;
         }

From 8cfdbf059bf8573f089e8d4566eb298cda2be2e8 Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Mon, 24 Oct 2022 14:08:02 +0800
Subject: [PATCH 04/11] fix up system

---
 src/cpu.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 6c3fc3c070d..b9ba9812aa1 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1273,11 +1273,17 @@ int get_physical_cpu_count()
 
 int get_physical_little_cpu_count()
 {
+    if (g_physical_cpucount == g_cpucount)
+        return get_little_cpu_count();
+
     return g_physical_cpucount * 2 - g_cpucount;
 }
 
 int get_physical_big_cpu_count()
 {
+    if (g_physical_cpucount == g_cpucount)
+        return get_big_cpu_count();
+
     return g_cpucount - g_physical_cpucount;
 }
 

From bd868e7aab96e3bdecbb062c97565a6ea0c6da17 Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Mon, 24 Oct 2022 14:18:01 +0800
Subject: [PATCH 05/11] set default to physical big cpu

---
 benchmark/benchncnn.cpp | 4 ++--
 src/option.cpp          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index 1d4ae1d755e..714dca3180f 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -161,8 +161,8 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op
 int main(int argc, char** argv)
 {
     int loop_count = 4;
-    int num_threads = ncnn::get_cpu_count();
-    int powersave = 0;
+    int num_threads = ncnn::get_physical_big_cpu_count();
+    int powersave = 2;
     int gpu_device = -1;
     int cooling_down = 1;
 
diff --git a/src/option.cpp b/src/option.cpp
index 4aabfdde5ed..80d4455307e 100644
--- a/src/option.cpp
+++ b/src/option.cpp
@@ -21,7 +21,7 @@ namespace ncnn {
 Option::Option()
 {
     lightmode = true;
-    num_threads = get_big_cpu_count();
+    num_threads = get_physical_big_cpu_count();
     blob_allocator = 0;
     workspace_allocator = 0;
 

From d9b76919e113a77e0b5591c09df08958195848e3 Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Thu, 27 Oct 2022 17:22:12 +0800
Subject: [PATCH 06/11] always treat smt core as big core

---
 src/cpu.cpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index b9ba9812aa1..75221822216 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1362,6 +1362,27 @@ static int get_max_freq_khz(int cpuid)
     return max_freq_khz;
 }
 
+static int get_thread_siblings_list_count(int cpuid)
+{
+    char path[256];
+    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
+
+    FILE* fp = fopen(path, "rb");
+    if (!fp)
+        return 1;
+
+    int count = 1;
+    while (!feof(fp))
+    {
+        if (fgetc(fp) == ',')
+            count++;
+    }
+
+    fclose(fp);
+
+    return count;
+}
+
 static int set_sched_affinity(const CpuSet& thread_affinity_mask)
 {
     // set affinity for thread
@@ -1480,6 +1501,13 @@ static int setup_thread_affinity_masks()
 
     for (int i = 0; i < g_cpucount; i++)
     {
+        if (get_thread_siblings_list_count(i) > 1)
+        {
+            // always treat smt core as big core
+            g_thread_affinity_mask_big.enable(i);
+            continue;
+        }
+
         if (cpu_max_freq_khz[i] < max_freq_khz_medium)
             g_thread_affinity_mask_little.enable(i);
         else

From 59baf2239dea107f140286030672c8799ca43c8f Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Fri, 28 Oct 2022 11:09:45 +0800
Subject: [PATCH 07/11] is_smt_cpu

---
 src/cpu.cpp | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 75221822216..4ce9edcd532 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1362,25 +1362,37 @@ static int get_max_freq_khz(int cpuid)
     return max_freq_khz;
 }
 
-static int get_thread_siblings_list_count(int cpuid)
+static bool is_smt_cpu(int cpuid)
 {
+    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
     char path[256];
-    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
+    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);
 
     FILE* fp = fopen(path, "rb");
+
     if (!fp)
-        return 1;
+    {
+        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
+        fp = fopen(path, "rb");
+
+        if (!fp)
+            return false;
+    }
 
-    int count = 1;
+    bool is_smt = false;
     while (!feof(fp))
     {
-        if (fgetc(fp) == ',')
-            count++;
+        char ch = fgetc(fp);
+        if (ch == ',' || ch == '-')
+        {
+            is_smt = true;
+            break;
+        }
     }
 
     fclose(fp);
 
-    return count;
+    return is_smt;
 }
 
 static int set_sched_affinity(const CpuSet& thread_affinity_mask)
@@ -1501,7 +1513,7 @@ static int setup_thread_affinity_masks()
 
     for (int i = 0; i < g_cpucount; i++)
     {
-        if (get_thread_siblings_list_count(i) > 1)
+        if (is_smt_cpu(i))
         {
             // always treat smt core as big core
             g_thread_affinity_mask_big.enable(i);

From aa1f4fc387eaacfee4d4c13e835584ed2da8cea8 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sun, 30 Oct 2022 21:36:08 +0800
Subject: [PATCH 08/11] get max freq mhz on windows

---
 src/cpu.cpp | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 1 deletion(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 4ce9edcd532..8533a4d0bfd 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -45,6 +45,7 @@
 #if defined _WIN32 && !(defined __MINGW32__)
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
+#include <powerbase.h>
 #endif
 
 #if defined __ANDROID__ || defined __linux__
@@ -1114,6 +1115,10 @@ static int get_cpucount()
         count = emscripten_num_logical_cores();
     else
         count = 1;
+#elif (defined _WIN32 && !(defined __MINGW32__))
+    SYSTEM_INFO system_info;
+    GetSystemInfo(&system_info);
+    count = system_info.dwNumberOfProcessors;
 #elif defined __ANDROID__ || defined __linux__
     // get cpu count from /proc/cpuinfo
     FILE* fp = fopen("/proc/cpuinfo", "rb");
@@ -1287,6 +1292,100 @@ int get_physical_big_cpu_count()
     return g_cpucount - g_physical_cpucount;
 }
 
+#if (defined _WIN32 && !(defined __MINGW32__))
+static int count_set_bits(ULONG_PTR bitMask)
+{
+    DWORD LSHIFT = sizeof(ULONG_PTR) * 8 - 1;
+    int bitSetCount = 0;
+    ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
+    DWORD i;
+
+    for (i = 0; i <= LSHIFT; ++i)
+    {
+        bitSetCount += ((bitMask & bitTest) ? 1 : 0);
+        bitTest /= 2;
+    }
+
+    return bitSetCount;
+}
+
+static ULONG_PTR get_smt_cpu_mask()
+{
+    ULONG_PTR smt_cpu_mask = 0;
+
+    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+    if (glpi == NULL)
+    {
+        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
+        return 0;
+    }
+
+    DWORD return_length = 0;
+    glpi(NULL, &return_length);
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
+    glpi(buffer, &return_length);
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
+    DWORD byte_offset = 0;
+    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
+    {
+        if (ptr->Relationship == RelationProcessorCore)
+        {
+            int smt_count = count_set_bits(ptr->ProcessorMask);
+            if (smt_count > 1)
+            {
+                // this core is smt
+                smt_cpu_mask |= ptr->ProcessorMask;
+            }
+        }
+
+        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        ptr++;
+    }
+
+    free(buffer);
+
+    return smt_cpu_mask;
+}
+
+static std::vector<int> get_max_freq_mhz()
+{
+    typedef struct _PROCESSOR_POWER_INFORMATION {
+        ULONG Number;
+        ULONG MaxMhz;
+        ULONG CurrentMhz;
+        ULONG MhzLimit;
+        ULONG MaxIdleState;
+        ULONG CurrentIdleState;
+    } PROCESSOR_POWER_INFORMATION, * PPROCESSOR_POWER_INFORMATION;
+
+    typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
+    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(GetModuleHandle(TEXT("powrprof")), "CallNtPowerInformation");
+    if (cnpi == NULL)
+    {
+        NCNN_LOGE("CallNtPowerInformation is not supported");
+        return std::vector<int>(g_cpucount, 0);
+    }
+
+    DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
+    PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);
+
+    cnpi(ProcessorInformation, NULL, 0, buffer, return_length);
+
+    std::vector<int> ret;
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        ULONG max_mhz = buffer[i].MaxMhz;
+        ret.push_back(max_mhz);
+    }
+
+    free(buffer);
+    return ret;
+}
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+
 #if defined __ANDROID__ || defined __linux__
 static int get_max_freq_khz(int cpuid)
 {
@@ -1485,7 +1584,48 @@ static int setup_thread_affinity_masks()
 {
     g_thread_affinity_mask_all.disable_all();
 
-#if defined __ANDROID__ || defined __linux__
+#if (defined _WIN32 && !(defined __MINGW32__))
+    // get max freq mhz for all cores
+    int max_freq_mhz_min = INT_MAX;
+    int max_freq_mhz_max = 0;
+    std::vector<int> all_max_freq_mhz = get_max_freq_mhz();
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        int max_freq_mhz = all_max_freq_mhz[i];
+
+        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);
+
+        if (max_freq_mhz > max_freq_mhz_max)
+            max_freq_mhz_max = max_freq_mhz;
+        if (max_freq_mhz < max_freq_mhz_min)
+            max_freq_mhz_min = max_freq_mhz;
+    }
+
+    int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
+    if (max_freq_mhz_medium == max_freq_mhz_max)
+    {
+        g_thread_affinity_mask_little.disable_all();
+        g_thread_affinity_mask_big = g_thread_affinity_mask_all;
+        return 0;
+    }
+
+    ULONG_PTR smt_cpu_mask = get_smt_cpu_mask();
+
+    for (int i = 0; i < g_cpucount; i++)
+    {
+        if (smt_cpu_mask & (1 << i))
+        {
+            // always treat smt core as big core
+            g_thread_affinity_mask_big.enable(i);
+            continue;
+        }
+
+        if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
+            g_thread_affinity_mask_little.enable(i);
+        else
+            g_thread_affinity_mask_big.enable(i);
+    }
+#elif defined __ANDROID__ || defined __linux__
     int max_freq_khz_min = INT_MAX;
     int max_freq_khz_max = 0;
     std::vector<int> cpu_max_freq_khz(g_cpucount);

From 0830f763d9b500716065861ee6ee8969c18e2662 Mon Sep 17 00:00:00 2001
From: nihui <nihui@users.noreply.github.com>
Date: Sun, 30 Oct 2022 13:37:46 +0000
Subject: [PATCH 09/11] apply code-format changes

---
 src/cpu.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 8533a4d0bfd..45e0a06cc0c 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1352,14 +1352,15 @@ static ULONG_PTR get_smt_cpu_mask()
 
 static std::vector<int> get_max_freq_mhz()
 {
-    typedef struct _PROCESSOR_POWER_INFORMATION {
+    typedef struct _PROCESSOR_POWER_INFORMATION
+    {
         ULONG Number;
         ULONG MaxMhz;
         ULONG CurrentMhz;
         ULONG MhzLimit;
         ULONG MaxIdleState;
         ULONG CurrentIdleState;
-    } PROCESSOR_POWER_INFORMATION, * PPROCESSOR_POWER_INFORMATION;
+    } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
 
     typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
     LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(GetModuleHandle(TEXT("powrprof")), "CallNtPowerInformation");

From b9815096749f13c6af09914a119023ca7826cd15 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sun, 30 Oct 2022 22:02:56 +0800
Subject: [PATCH 10/11] windows thread affinity

---
 src/cpu.cpp | 92 +++++++++++++++++++++++++++++++++++++----------------
 src/cpu.h   |  7 ++++
 2 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 45e0a06cc0c..96a02ebe957 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -302,7 +302,44 @@ static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT
 static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");
 #endif // __APPLE__
 
-#if defined __ANDROID__ || defined __linux__
+#if (defined _WIN32 && !(defined __MINGW32__))
+CpuSet::CpuSet()
+{
+    disable_all(); // a new set starts with no logical processor selected
+}
+
+void CpuSet::enable(int cpu)
+{
+    mask |= ((ULONG_PTR)1 << cpu); // widen before shifting: a plain int 1 cannot reach the upper bits of mask
+}
+
+void CpuSet::disable(int cpu)
+{
+    mask &= ~((ULONG_PTR)1 << cpu); // same widening, so the complement spans the whole mask
+}
+
+void CpuSet::disable_all()
+{
+    mask = 0;
+}
+
+bool CpuSet::is_enabled(int cpu) const
+{
+    return (mask & ((ULONG_PTR)1 << cpu)) != 0; // true when bit `cpu` of mask is set
+}
+
+int CpuSet::num_enabled() const
+{
+    int num_enabled = 0;
+    for (int i = 0; i < (int)sizeof(mask) * 8; i++) // visits every bit position of mask
+    {
+        if (is_enabled(i))
+            num_enabled++;
+    }
+
+    return num_enabled;
+}
+#elif defined __ANDROID__ || defined __linux__
 CpuSet::CpuSet()
 {
     disable_all();
@@ -1293,32 +1330,16 @@ int get_physical_big_cpu_count()
 }
 
 #if (defined _WIN32 && !(defined __MINGW32__))
-static int count_set_bits(ULONG_PTR bitMask)
-{
-    DWORD LSHIFT = sizeof(ULONG_PTR) * 8 - 1;
-    int bitSetCount = 0;
-    ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
-    DWORD i;
-
-    for (i = 0; i <= LSHIFT; ++i)
-    {
-        bitSetCount += ((bitMask & bitTest) ? 1 : 0);
-        bitTest /= 2;
-    }
-
-    return bitSetCount;
-}
-
-static ULONG_PTR get_smt_cpu_mask()
+static CpuSet get_smt_cpu_mask()
 {
-    ULONG_PTR smt_cpu_mask = 0;
+    CpuSet smt_cpu_mask;
 
     typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
     LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
     if (glpi == NULL)
     {
         NCNN_LOGE("GetLogicalProcessorInformation is not supported");
-        return 0;
+        return smt_cpu_mask;
     }
 
     DWORD return_length = 0;
@@ -1333,11 +1354,12 @@ static ULONG_PTR get_smt_cpu_mask()
     {
         if (ptr->Relationship == RelationProcessorCore)
         {
-            int smt_count = count_set_bits(ptr->ProcessorMask);
-            if (smt_count > 1)
+            CpuSet smt_set;
+            smt_set.mask = ptr->ProcessorMask;
+            if (smt_set.num_enabled() > 1)
             {
                 // this core is smt
-                smt_cpu_mask |= ptr->ProcessorMask;
+                smt_cpu_mask.mask |= smt_set.mask;
             }
         }
 
@@ -1362,11 +1384,14 @@ static std::vector<int> get_max_freq_mhz()
         ULONG CurrentIdleState;
     } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
 
+    HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));
+
     typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
-    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(GetModuleHandle(TEXT("powrprof")), "CallNtPowerInformation");
+    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
     if (cnpi == NULL)
     {
         NCNN_LOGE("CallNtPowerInformation is not supported");
+        FreeLibrary(powrprof);
         return std::vector<int>(g_cpucount, 0);
     }
 
@@ -1383,8 +1408,21 @@ static std::vector<int> get_max_freq_mhz()
     }
 
     free(buffer);
+    FreeLibrary(powrprof);
     return ret;
 }
+
+static int set_sched_affinity(const CpuSet& thread_affinity_mask)
+{
+    // Applies the mask to the calling thread; returns 0 on success and -1 when the call reports failure (0).
+    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
+    if (prev_mask != 0)
+        return 0;
+
+    // GetLastError() is a DWORD, so convert it explicitly to match the %d conversion
+    NCNN_LOGE("SetThreadAffinityMask failed %d", (int)GetLastError());
+    return -1;
+}
 #endif // (defined _WIN32 && !(defined __MINGW32__))
 
 #if defined __ANDROID__ || defined __linux__
@@ -1610,11 +1648,11 @@ static int setup_thread_affinity_masks()
         return 0;
     }
 
-    ULONG_PTR smt_cpu_mask = get_smt_cpu_mask();
+    CpuSet smt_cpu_mask = get_smt_cpu_mask();
 
     for (int i = 0; i < g_cpucount; i++)
     {
-        if (smt_cpu_mask & (1 << i))
+        if (smt_cpu_mask.is_enabled(i))
         {
             // always treat smt core as big core
             g_thread_affinity_mask_big.enable(i);
@@ -1781,7 +1819,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave)
 
 int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
 {
-#if defined __ANDROID__ || defined __linux__
+#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__))
     int num_threads = thread_affinity_mask.num_enabled();
 
 #ifdef _OPENMP
diff --git a/src/cpu.h b/src/cpu.h
index cdc8b229b72..c38061df0b1 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -17,6 +17,10 @@
 
 #include <stddef.h>
 
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
 #if defined __ANDROID__ || defined __linux__
 #include <sched.h> // cpu_set_t
 #endif
@@ -36,6 +40,9 @@ class NCNN_EXPORT CpuSet
     int num_enabled() const;
 
 public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif
 #if defined __ANDROID__ || defined __linux__
     cpu_set_t cpu_set;
 #endif

From 73a8140de390b19169eaa2083d8e740279007c1c Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sun, 30 Oct 2022 22:10:30 +0800
Subject: [PATCH 11/11] fix build

---
 src/cpu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpu.cpp b/src/cpu.cpp
index 96a02ebe957..eb012c31587 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -1627,10 +1627,10 @@ static int setup_thread_affinity_masks()
     // get max freq mhz for all cores
     int max_freq_mhz_min = INT_MAX;
     int max_freq_mhz_max = 0;
-    std::vector<int> all_max_freq_mhz = get_max_freq_mhz();
+    std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
     for (int i = 0; i < g_cpucount; i++)
     {
-        int max_freq_mhz = all_max_freq_mhz[i];
+        int max_freq_mhz = cpu_max_freq_mhz[i];
 
         // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);