Added runtime detection

Expanded the cache size to 93 (we will need this in near future) Fixed detection of VAES, GFNI and VPCLMULQDQ Could not test with `cupid` because they do not support these yet
rust-lang · Jun 23, 2024 · 3721f9a · 3721f9a
1 parent fd5fc64
commit 3721f9a
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 21 deletions.
diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs
@@ -76,6 +76,11 @@ features! {
     /// * `"avx512bf16"`
     /// * `"avx512vp2intersect"`
     /// * `"avx512fp16"`
+    /// * `"avxvnni"`
+    /// * `"avxifma"`
+    /// * `"avxneconvert"`
+    /// * `"avxvnniint8"`
+    /// * `"avxvnniint16"`
     /// * `"f16c"`
     /// * `"fma"`
     /// * `"bmi1"`
@@ -172,6 +177,16 @@ features! {
     /// AVX-512 P2INTERSECT
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
     /// AVX-512 FP16 (FLOAT16 instructions)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
+    /// AVX-IFMA (Integer Fused Multiply Add)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
+    /// AVX-NE-CONVERT (Exceptionless Convert)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
+    /// AVX-VNNI (Vector Neural Network Instructions)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
+    /// AVX-VNNI_INT8 (VNNI with 16-bit Integers)
+    @FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
+    /// AVX-VNNI_INT16 (VNNI with 8-bit integers)
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
     /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
     @FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";

diff --git a/crates/std_detect/src/detect/cache.rs b/crates/std_detect/src/detect/cache.rs
@@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;
 
 /// Sets the `bit` of `x`.
 #[inline]
-const fn set_bit(x: u64, bit: u32) -> u64 {
+const fn set_bit(x: u128, bit: u32) -> u128 {
     x | 1 << bit
 }
 
 /// Tests the `bit` of `x`.
 #[inline]
-const fn test_bit(x: u64, bit: u32) -> bool {
+const fn test_bit(x: u128, bit: u32) -> bool {
     x & (1 << bit) != 0
 }
 
 /// Unset the `bit of `x`.
 #[inline]
-const fn unset_bit(x: u64, bit: u32) -> u64 {
+const fn unset_bit(x: u128, bit: u32) -> u128 {
     x & !(1 << bit)
 }
 
 /// Maximum number of features that can be cached.
-const CACHE_CAPACITY: u32 = 62;
+const CACHE_CAPACITY: u32 = 93;
 
 /// This type is used to initialize the cache
 // The derived `Default` implementation will initialize the field to zero,
 // which is what we want.
 #[derive(Copy, Clone, Default)]
-pub(crate) struct Initializer(u64);
+pub(crate) struct Initializer(u128);
 
 // NOTE: the `debug_assert!` would catch that we do not add more Features than
 // the one fitting our cache.
@@ -71,10 +71,15 @@ impl Initializer {
 }
 
 /// This global variable is a cache of the features supported by the CPU.
-// Note: on x64, we only use the first slot
-static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];
-
-/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
+// Note: the third slot is only used in x86
+// Another Slot can be added if needed without any change to `Initializer`
+static CACHE: [Cache; 3] = [
+    Cache::uninitialized(),
+    Cache::uninitialized(),
+    Cache::uninitialized(),
+];
+
+/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
 ///
 /// Note: 0 is used to represent an uninitialized cache, and (at least) the most
 /// significant bit is set on any cache which has been initialized.
@@ -102,7 +107,7 @@ impl Cache {
         if cached == 0 {
             None
         } else {
-            Some(test_bit(cached as u64, bit))
+            Some(test_bit(cached as u128, bit))
         }
     }
 
@@ -173,6 +178,7 @@ cfg_if::cfg_if! {
 fn do_initialize(value: Initializer) {
     CACHE[0].initialize((value.0) as usize & Cache::MASK);
     CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
+    CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
 }
 
 // We only have to detect features once, and it's fairly costly, so hint to LLVM
@@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
 pub(crate) fn test(bit: u32) -> bool {
     let (relative_bit, idx) = if bit < Cache::CAPACITY {
         (bit, 0)
-    } else {
+    } else if bit < 2 * Cache::CAPACITY {
         (bit - Cache::CAPACITY, 1)
+    } else {
+        (bit - 2 * Cache::CAPACITY, 2)
     };
     CACHE[idx]
         .test(relative_bit)

diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs
@@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
         extended_features_ecx,
         extended_features_edx,
         extended_features_eax_leaf_1,
+        extended_features_edx_leaf_1,
     ) = if max_basic_leaf >= 7 {
         let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
-        let CpuidResult { eax: eax_1, .. } =
-            unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
-        (ebx, ecx, edx, eax_1)
+        let CpuidResult {
+            eax: eax_1,
+            edx: edx_1,
+            ..
+        } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
+        (ebx, ecx, edx, eax_1, edx_1)
     } else {
-        (0, 0, 0, 0) // CPUID does not support "Extended Features"
+        (0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
     };
 
     // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
@@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
         enable(proc_info_edx, 26, Feature::sse2);
         enable(extended_features_ebx, 29, Feature::sha);
 
+        enable(extended_features_ecx, 8, Feature::gfni);
+        enable(extended_features_ecx, 9, Feature::vaes);
+        enable(extended_features_ecx, 10, Feature::vpclmulqdq);
+
         enable(extended_features_ebx, 3, Feature::bmi1);
         enable(extended_features_ebx, 8, Feature::bmi2);
 
@@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
                 let xcr0 = unsafe { _xgetbv(0) };
                 // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
                 let os_avx_support = xcr0 & 6 == 6;
-                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
-                let os_avx512_support = xcr0 & 224 == 224;
+                // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+                let os_avx512_support = xcr0 & 0xe0 == 0xe0;
 
                 // Only if the OS and the CPU support saving/restoring the AVX
                 // registers we enable `xsave` support:
@@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
                     enable(proc_info_ecx, 28, Feature::avx);
                     enable(extended_features_ebx, 5, Feature::avx2);
 
+                    // "Short" versions of AVX512 instructions
+                    enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
+                    enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
+                    enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
+                    enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
+                    enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);
+
                     // For AVX-512 the OS also needs to support saving/restoring
                     // the extended state, only then we enable AVX-512 support:
                     if os_avx512_support {
@@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
                         enable(extended_features_ebx, 31, Feature::avx512vl);
                         enable(extended_features_ecx, 1, Feature::avx512vbmi);
                         enable(extended_features_ecx, 6, Feature::avx512vbmi2);
-                        enable(extended_features_ecx, 8, Feature::gfni);
-                        enable(extended_features_ecx, 9, Feature::vaes);
-                        enable(extended_features_ecx, 10, Feature::vpclmulqdq);
                         enable(extended_features_ecx, 11, Feature::avx512vnni);
                         enable(extended_features_ecx, 12, Feature::avx512bitalg);
                         enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);

diff --git a/crates/std_detect/tests/x86-specific.rs b/crates/std_detect/tests/x86-specific.rs
@@ -1,6 +1,6 @@
 #![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #![allow(internal_features)]
-#![feature(stdarch_internal)]
+#![feature(stdarch_internal, avx512_target_feature)]
 
 extern crate cupid;
 #[macro_use]
@@ -68,6 +68,17 @@ fn dump() {
     println!("adx: {:?}", is_x86_feature_detected!("adx"));
     println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
     println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
+    println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
+    println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
+    println!(
+        "avxneconvert: {:?}",
+        is_x86_feature_detected!("avxneconvert")
+    );
+    println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
+    println!(
+        "avxvnniint16: {:?}",
+        is_x86_feature_detected!("avxvnniint16")
+    );
 }
 
 #[cfg(feature = "std_detect_env_override")]