Skip to content

Commit

Permalink
Added runtime detection
Browse files Browse the repository at this point in the history
Expanded the cache size to 93 (we will need this in near future)
Fixed detection of VAES, GFNI and VPCLMULQDQ
Could not test with `cupid` because they do not support these yet
  • Loading branch information
sayantn authored and Amanieu committed Jun 23, 2024
1 parent fd5fc64 commit 3721f9a
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 21 deletions.
15 changes: 15 additions & 0 deletions crates/std_detect/src/detect/arch/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ features! {
/// * `"avx512bf16"`
/// * `"avx512vp2intersect"`
/// * `"avx512fp16"`
/// * `"avxvnni"`
/// * `"avxifma"`
/// * `"avxneconvert"`
/// * `"avxvnniint8"`
/// * `"avxvnniint16"`
/// * `"f16c"`
/// * `"fma"`
/// * `"bmi1"`
Expand Down Expand Up @@ -172,6 +177,16 @@ features! {
/// AVX-512 P2INTERSECT
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
/// AVX-512 FP16 (FLOAT16 instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
/// AVX-IFMA (Integer Fused Multiply Add)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
/// AVX-NE-CONVERT (Exceptionless Convert)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
/// AVX-VNNI (Vector Neural Network Instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
/// AVX-VNNI_INT16 (VNNI with 16-bit integers)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
/// AVX-VNNI_INT8 (VNNI with 8-bit integers)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
/// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
Expand Down
30 changes: 19 additions & 11 deletions crates/std_detect/src/detect/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;

/// Sets the `bit` of `x`.
#[inline]
const fn set_bit(x: u64, bit: u32) -> u64 {
const fn set_bit(x: u128, bit: u32) -> u128 {
x | 1 << bit
}

/// Tests the `bit` of `x`.
#[inline]
const fn test_bit(x: u64, bit: u32) -> bool {
const fn test_bit(x: u128, bit: u32) -> bool {
x & (1 << bit) != 0
}

/// Unsets the `bit` of `x`.
#[inline]
const fn unset_bit(x: u64, bit: u32) -> u64 {
const fn unset_bit(x: u128, bit: u32) -> u128 {
x & !(1 << bit)
}

/// Maximum number of features that can be cached.
const CACHE_CAPACITY: u32 = 62;
const CACHE_CAPACITY: u32 = 93;

/// This type is used to initialize the cache
// The derived `Default` implementation will initialize the field to zero,
// which is what we want.
#[derive(Copy, Clone, Default)]
pub(crate) struct Initializer(u64);
pub(crate) struct Initializer(u128);

// NOTE: the `debug_assert!` would catch that we do not add more Features than
// the one fitting our cache.
Expand Down Expand Up @@ -71,10 +71,15 @@ impl Initializer {
}

/// This global variable is a cache of the features supported by the CPU.
// Note: on x64, we only use the first slot
static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];

/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
// Note: the third slot is only used in x86
// Another slot can be added if needed without any change to `Initializer`
static CACHE: [Cache; 3] = [
Cache::uninitialized(),
Cache::uninitialized(),
Cache::uninitialized(),
];

/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
///
/// Note: 0 is used to represent an uninitialized cache, and (at least) the most
/// significant bit is set on any cache which has been initialized.
Expand Down Expand Up @@ -102,7 +107,7 @@ impl Cache {
if cached == 0 {
None
} else {
Some(test_bit(cached as u64, bit))
Some(test_bit(cached as u128, bit))
}
}

Expand Down Expand Up @@ -173,6 +178,7 @@ cfg_if::cfg_if! {
fn do_initialize(value: Initializer) {
CACHE[0].initialize((value.0) as usize & Cache::MASK);
CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
}

// We only have to detect features once, and it's fairly costly, so hint to LLVM
Expand Down Expand Up @@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
pub(crate) fn test(bit: u32) -> bool {
let (relative_bit, idx) = if bit < Cache::CAPACITY {
(bit, 0)
} else {
} else if bit < 2 * Cache::CAPACITY {
(bit - Cache::CAPACITY, 1)
} else {
(bit - 2 * Cache::CAPACITY, 2)
};
CACHE[idx]
.test(relative_bit)
Expand Down
30 changes: 21 additions & 9 deletions crates/std_detect/src/detect/os/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
extended_features_ecx,
extended_features_edx,
extended_features_eax_leaf_1,
extended_features_edx_leaf_1,
) = if max_basic_leaf >= 7 {
let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
let CpuidResult { eax: eax_1, .. } =
unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1)
let CpuidResult {
eax: eax_1,
edx: edx_1,
..
} = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1, edx_1)
} else {
(0, 0, 0, 0) // CPUID does not support "Extended Features"
(0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
};

// EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
Expand Down Expand Up @@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_edx, 26, Feature::sse2);
enable(extended_features_ebx, 29, Feature::sha);

enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);

enable(extended_features_ebx, 3, Feature::bmi1);
enable(extended_features_ebx, 8, Feature::bmi2);

Expand Down Expand Up @@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
let xcr0 = unsafe { _xgetbv(0) };
// Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
let os_avx_support = xcr0 & 6 == 6;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
let os_avx512_support = xcr0 & 224 == 224;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
let os_avx512_support = xcr0 & 0xe0 == 0xe0;

// Only if the OS and the CPU support saving/restoring the AVX
// registers we enable `xsave` support:
Expand Down Expand Up @@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_ecx, 28, Feature::avx);
enable(extended_features_ebx, 5, Feature::avx2);

// "Short" versions of AVX512 instructions
enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);

// For AVX-512 the OS also needs to support saving/restoring
// the extended state, only then we enable AVX-512 support:
if os_avx512_support {
Expand All @@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(extended_features_ebx, 31, Feature::avx512vl);
enable(extended_features_ecx, 1, Feature::avx512vbmi);
enable(extended_features_ecx, 6, Feature::avx512vbmi2);
enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
enable(extended_features_ecx, 11, Feature::avx512vnni);
enable(extended_features_ecx, 12, Feature::avx512bitalg);
enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
Expand Down
13 changes: 12 additions & 1 deletion crates/std_detect/tests/x86-specific.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![allow(internal_features)]
#![feature(stdarch_internal)]
#![feature(stdarch_internal, avx512_target_feature)]

extern crate cupid;
#[macro_use]
Expand Down Expand Up @@ -68,6 +68,17 @@ fn dump() {
println!("adx: {:?}", is_x86_feature_detected!("adx"));
println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
println!(
"avxneconvert: {:?}",
is_x86_feature_detected!("avxneconvert")
);
println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
println!(
"avxvnniint16: {:?}",
is_x86_feature_detected!("avxvnniint16")
);
}

#[cfg(feature = "std_detect_env_override")]
Expand Down

0 comments on commit 3721f9a

Please sign in to comment.