diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 1ad80147cf..9d95f0c492 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1784,113 +1784,113 @@ * [x] [`_mm512_setzero_si512`] * [x] [`_mm512_setzero`] * [x] [`_mm512_load_epi32`] - * [ ] [`_mm512_mask_load_epi32`] //need i1 - * [ ] [`_mm512_maskz_load_epi32`] //need i1 + * [x] [`_mm512_mask_load_epi32`] //need i1 + * [x] [`_mm512_maskz_load_epi32`] //need i1 * [x] [`_mm_load_epi32`] - * [_] [`_mm_mask_load_epi32`] //need i1 - * [_] [`_mm_maskz_load_epi32`] //need i1 + * [x] [`_mm_mask_load_epi32`] //need i1 + * [x] [`_mm_maskz_load_epi32`] //need i1 * [x] [`_mm256_load_epi32`] - * [_] [`_mm256_mask_load_epi32`] //need i1 - * [_] [`_mm256_maskz_load_epi32`] //need i1 + * [x] [`_mm256_mask_load_epi32`] //need i1 + * [x] [`_mm256_maskz_load_epi32`] //need i1 * [x] [`_mm512_load_epi64`] - * [ ] [`_mm512_mask_load_epi64`] //need i1 - * [ ] [`_mm512_maskz_load_epi64`] //need i1 + * [x] [`_mm512_mask_load_epi64`] //need i1 + * [x] [`_mm512_maskz_load_epi64`] //need i1 * [x] [`_mm_load_epi64`] //need i1 - * [_] [`_mm_mask_load_epi64`] //need i1 - * [_] [`_mm_maskz_load_epi64`] //need i1 + * [x] [`_mm_mask_load_epi64`] //need i1 + * [x] [`_mm_maskz_load_epi64`] //need i1 * [x] [`_mm256_load_epi64`] //need i1 - * [_] [`_mm256_mask_load_epi64`] //need i1 - * [_] [`_mm256_maskz_load_epi64`] //need i1 + * [x] [`_mm256_mask_load_epi64`] //need i1 + * [x] [`_mm256_maskz_load_epi64`] //need i1 * [x] [`_mm512_load_ps`] - * [ ] [`_mm512_mask_load_ps`] //need i1 - * [ ] [`_mm512_maskz_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i - * [_] [`_mm_mask_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i1 - * [_] [`_mm256_mask_load_ps`] //need i1 - * [_] [`_mm256_maskz_load_ps`] //need i1 + * [x] [`_mm512_mask_load_ps`] //need i1 + * [x] [`_mm512_maskz_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i + * [x] [`_mm_mask_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i1 + * [x] [`_mm256_mask_load_ps`] //need i1 + * [x] [`_mm256_maskz_load_ps`] //need i1 * [x] [`_mm512_load_pd`] - * [ ] [`_mm512_mask_load_pd`] //need i1 - * [ ] [`_mm512_maskz_load_pd`] //need i1 - * [_] [`_mm_mask_load_pd`] //need i1 - * [_] [`_mm_maskz_load_pd`] //need i1 - * [_] [`_mm256_mask_load_pd`] //need i1 - * [_] [`_mm256_maskz_load_pd`] //need i1 + * [x] [`_mm512_mask_load_pd`] //need i1 + * [x] [`_mm512_maskz_load_pd`] //need i1 + * [x] [`_mm_mask_load_pd`] //need i1 + * [x] [`_mm_maskz_load_pd`] //need i1 + * [x] [`_mm256_mask_load_pd`] //need i1 + * [x] [`_mm256_maskz_load_pd`] //need i1 * [x] [`_mm512_load_si512`] * [x] [`_mm512_loadu_epi32`] - * [ ] [`_mm512_mask_loadu_epi32`] //need i1 + * [x] [`_mm512_mask_loadu_epi32`] //need i1 * [x] [`_mm_loadu_epi32`] - * [_] [`_mm_mask_loadu_epi32`] //need i1 - * [_] [`_mm_maskz_loadu_epi32`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi32`] //need i1 + * [x] [`_mm_mask_loadu_epi32`] //need i1 + * [x] [`_mm_maskz_loadu_epi32`] //need i1 + * [x] [`_mm512_maskz_loadu_epi32`] //need i1 * [x] [`_mm256_loadu_epi32`] - * [_] [`_mm256_mask_loadu_epi32`] //need i1 - * [_] [`_mm256_maskz_loadu_epi32`] //need i1 + * [x] [`_mm256_mask_loadu_epi32`] //need i1 + * [x] [`_mm256_maskz_loadu_epi32`] //need i1 * [x] [`_mm512_loadu_epi64`] - * [ ] [`_mm512_mask_loadu_epi64`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi64`] //need i1 + * [x] [`_mm512_mask_loadu_epi64`] //need i1 + * [x] [`_mm512_maskz_loadu_epi64`] //need i1 * [x] [`_mm_loadu_epi64`] - * [_] 
[`_mm_mask_loadu_epi64`] //need i1 - * [_] [`_mm_maskz_loadu_epi64`] //need i1 + * [x] [`_mm_mask_loadu_epi64`] //need i1 + * [x] [`_mm_maskz_loadu_epi64`] //need i1 * [x] [`_mm256_loadu_epi64`] - * [_] [`_mm256_mask_loadu_epi64`] //need i1 - * [_] [`_mm256_maskz_loadu_epi64`] //need i1 + * [x] [`_mm256_mask_loadu_epi64`] //need i1 + * [x] [`_mm256_maskz_loadu_epi64`] //need i1 * [x] [`_mm512_loadu_ps`] - * [ ] [`_mm512_mask_loadu_ps`] //need i1 - * [ ] [`_mm512_maskz_loadu_ps`] //need i1 - * [_] [`_mm_mask_loadu_ps`] //need i1 - * [_] [`_mm_maskz_loadu_ps`] //need i1 - * [_] [`_mm256_mask_loadu_ps`] //need i1 - * [_] [`_mm256_maskz_loadu_ps`] //need i1 + * [x] [`_mm512_mask_loadu_ps`] //need i1 + * [x] [`_mm512_maskz_loadu_ps`] //need i1 + * [x] [`_mm_mask_loadu_ps`] //need i1 + * [x] [`_mm_maskz_loadu_ps`] //need i1 + * [x] [`_mm256_mask_loadu_ps`] //need i1 + * [x] [`_mm256_maskz_loadu_ps`] //need i1 * [x] [`_mm512_loadu_pd`] - * [ ] [`_mm512_mask_loadu_pd`] //need i1 - * [ ] [`_mm512_maskz_loadu_pd`] //need i1 - * [_] [`_mm_mask_loadu_pd`] //need i1 - * [_] [`_mm_maskz_loadu_pd`] //need i1 - * [_] [`_mm256_mask_loadu_pd`] //need i1 - * [_] [`_mm256_maskz_loadu_pd`] //need i1 + * [x] [`_mm512_mask_loadu_pd`] //need i1 + * [x] [`_mm512_maskz_loadu_pd`] //need i1 + * [x] [`_mm_mask_loadu_pd`] //need i1 + * [x] [`_mm_maskz_loadu_pd`] //need i1 + * [x] [`_mm256_mask_loadu_pd`] //need i1 + * [x] [`_mm256_maskz_loadu_pd`] //need i1 * [x] [`_mm512_loadu_si512`] * [x] [`_mm512_store_epi32`] - * [ ] [`_mm512_mask_store_epi32`] //need i1 - * [_] [`_mm_mask_store_epi32`] //need i1 + * [x] [`_mm512_mask_store_epi32`] //need i1 + * [x] [`_mm_mask_store_epi32`] //need i1 * [x] [`_mm_store_epi32`] - * [_] [`_mm256_mask_store_epi32`] //need i1 + * [x] [`_mm256_mask_store_epi32`] //need i1 * [x] [`_mm256_store_epi32`] * [x] [`_mm512_store_epi64`] - * [ ] [`_mm512_mask_store_epi64`] //need i1 - * [_] [`_mm_mask_store_epi64`] //need i1 + * [x] [`_mm512_mask_store_epi64`] //need i1 + * [x] [`_mm_mask_store_epi64`] //need i1 * [x] [`_mm_store_epi64`] - * [_] [`_mm256_mask_store_epi64`] //need i1 + * [x] [`_mm256_mask_store_epi64`] //need i1 * [x] [`_mm256_store_epi64`] * [x] [`_mm512_store_ps`] - * [ ] [`_mm512_mask_store_ps`] //need i1 - * [_] [`_mm_mask_store_ps`] //need i1 - * [_] [`_mm256_mask_store_ps`] //need i1 + * [x] [`_mm512_mask_store_ps`] //need i1 + * [x] [`_mm_mask_store_ps`] //need i1 + * [x] [`_mm256_mask_store_ps`] //need i1 * [x] [`_mm512_store_pd`] - * [ ] [`_mm512_mask_store_pd`] //need i1 - * [_] [`_mm_mask_store_pd`] //need i1 - * [_] [`_mm256_mask_store_pd`] //need i1 + * [x] [`_mm512_mask_store_pd`] //need i1 + * [x] [`_mm_mask_store_pd`] //need i1 + * [x] [`_mm256_mask_store_pd`] //need i1 * [x] [`_mm512_store_si512`] * [x] [`_mm512_storeu_epi32`] - * [ ] [`_mm512_mask_storeu_epi32`] //need i1 - * [_] [`_mm_mask_storeu_epi32`] //need i1 + * [x] [`_mm512_mask_storeu_epi32`] //need i1 + * [x] [`_mm_mask_storeu_epi32`] //need i1 * [x] [`_mm_storeu_epi32`] - * [_] [`_mm256_mask_storeu_epi32`] //need i1 + * [x] [`_mm256_mask_storeu_epi32`] //need i1 * [x] [`_mm256_storeu_epi32`] * [x] [`_mm512_storeu_epi64`] - * [ ] [`_mm512_mask_storeu_epi64`] //need i1 - * [_] [`_mm_mask_storeu_epi64`] //need i1 + * [x] [`_mm512_mask_storeu_epi64`] //need i1 + * [x] [`_mm_mask_storeu_epi64`] //need i1 * [x] [`_mm_storeu_epi64`] - * [_] [`_mm256_mask_storeu_epi64`] //need i1 + * [x] [`_mm256_mask_storeu_epi64`] //need i1 * [x] [`_mm256_storeu_epi64`] * [x] [`_mm512_storeu_ps`] - * [ ] 
[`_mm512_mask_storeu_ps`] //need i1 - * [_] [`_mm_mask_storeu_ps`] //need i1 - * [_] [`_mm256_mask_storeu_ps`] //need i1 + * [x] [`_mm512_mask_storeu_ps`] //need i1 + * [x] [`_mm_mask_storeu_ps`] //need i1 + * [x] [`_mm256_mask_storeu_ps`] //need i1 * [x] [`_mm512_storeu_pd`] - * [ ] [`_mm512_mask_storeu_pd`] //need i1 - * [_] [`_mm_mask_storeu_pd`] //need i1 - * [_] [`_mm256_mask_storeu_pd`] //need i1 + * [x] [`_mm512_mask_storeu_pd`] //need i1 + * [x] [`_mm_mask_storeu_pd`] //need i1 + * [x] [`_mm256_mask_storeu_pd`] //need i1 * [x] [`_mm512_storeu_si512`] * [ ] [`_mm512_stream_load_si512`] //stream_load_si256, ... not implment yet * [x] [`_mm512_stream_pd`] diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 10e0096339..0363004674 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4227,6 +4227,330 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { ptr::write_unaligned(mem_addr as *mut __m128i, a); } +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
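A note on the `asm!` templates above: the doubled braces are format-string escapes, so `"vmovdqu16 {2}{{{1}}} {{z}}, [{0}]"` renders as, for example, `vmovdqu16 zmm0{k1} {z}, [rax]`; with the `{{z}}` suffix the instruction zeroes unselected lanes, without it the existing lanes of the destination are merged in. A minimal usage sketch of the zero-masking form, assuming these new intrinsics are re-exported from `core::arch::x86_64` as usual (the helper name is hypothetical, not part of the patch):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: load the trailing `tail.len()` (at most 32) 16-bit
// elements of a buffer without reading past its end. Bit i of the mask
// selects lane i; lanes with a cleared bit are zeroed, not read.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn load_i16_tail(tail: &[i16]) -> __m512i {
    debug_assert!(tail.len() <= 32);
    let k = ((1u64 << tail.len()) - 1) as __mmask32;
    _mm512_maskz_loadu_epi16(k, tail.as_ptr())
}
```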
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
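The zero-masking loads pair naturally with the merging stores added below. A hedged sketch of the classic sub-vector copy (hypothetical helper name; both intrinsics are the ones introduced by this patch):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: copy `n <= 64` bytes with one masked load and one
// masked store, never touching memory at or beyond index `n` on either side.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn copy_up_to_64(dst: *mut i8, src: *const i8, n: usize) {
    debug_assert!(n <= 64);
    let k: __mmask64 = if n == 64 { !0 } else { (1u64 << n) - 1 };
    let v = _mm512_maskz_loadu_epi8(k, src);
    _mm512_mask_storeu_epi8(dst, k, v);
}
```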
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_madd_epi16&expand=3511) @@ -13826,6 +14150,284 @@ mod tests { assert_eq_m128i(r, a); } + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi16(m, black_box(p)); + let e = &[ + 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi16() { + let mut r = [42_i16; 32]; + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm512_loadu_epi16(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 
23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi8() { + let mut r = [42_i8; 64]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let a = _mm512_loadu_epi8(a.as_ptr()); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi16() { + let mut r = [42_i16; 16]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm256_loadu_epi16(a.as_ptr()); + let m = 0b11101000_11001010; + _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_maskz_loadu_epi8(m, black_box(p)); 
+ let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi8() { + let mut r = [42_i8; 32]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm256_loadu_epi8(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_mask_loadu_epi16(src, m, black_box(p)); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi16() { + let mut r = [42_i16; 8]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let a = _mm_loadu_epi16(a.as_ptr()); + let m = 0b11001010; + _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_loadu_epi8(m, black_box(p)); + let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi8() { + let mut r = [42_i8; 16]; + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm_loadu_epi8(a.as_ptr()); + let m = 0b11101000_11001010; + _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); + } + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_madd_epi16() { let a = _mm512_set1_epi16(1); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 766acf46f7..7633442aa6 
100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30323,2842 +30323,4138 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi32) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_setr_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi32) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_set_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
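Masks for the merging loads usually come from a compare. A hedged sketch (hypothetical helper name; `_mm512_cmpeq_epi32_mask` is the existing AVX-512F compare intrinsic):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: reload from `src_ptr` only the lanes of `current`
// that are still zero; all other lanes of `current` pass through untouched.
#[target_feature(enable = "avx512f")]
unsafe fn fill_zero_lanes(current: __m512i, src_ptr: *const i32) -> __m512i {
    let k: __mmask16 = _mm512_cmpeq_epi32_mask(current, _mm512_setzero_si512());
    _mm512_mask_loadu_epi32(current, k, src_ptr)
}
```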
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut mov: f32 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut mov: f32 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut mov: f64 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut mov: f64 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
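For the packed-float forms, a hedged sketch of the usual tail-handling pattern these intrinsics enable (hypothetical helper name; everything else is existing AVX-512F API):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: sum an f32 slice of any length. The remainder is
// handled by a single zero-masked load instead of a scalar loop, and the
// load never reads past `xs.len()`.
#[target_feature(enable = "avx512f")]
unsafe fn sum_f32(xs: &[f32]) -> f32 {
    let mut acc = _mm512_setzero_ps();
    let mut chunks = xs.chunks_exact(16);
    for c in chunks.by_ref() {
        acc = _mm512_add_ps(acc, _mm512_loadu_ps(c.as_ptr()));
    }
    let rem = chunks.remainder();
    if !rem.is_empty() {
        let k = ((1u32 << rem.len()) - 1) as __mmask16;
        acc = _mm512_add_ps(acc, _mm512_maskz_loadu_ps(k, rem.as_ptr()));
    }
    let mut lanes = [0.0f32; 16];
    _mm512_storeu_ps(lanes.as_mut_ptr(), acc);
    lanes.iter().sum()
}
```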
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
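The 256-bit merging forms make "load what exists, fill the rest with a sentinel" a one-liner. A hedged sketch (hypothetical helper name):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: lanes whose mask bit is clear are not read from
// memory; merge-masking fills them from `src`, here a NaN sentinel.
#[target_feature(enable = "avx512f,avx512vl,avx")]
unsafe fn load_f32x8_or_nan(k: __mmask8, p: *const f32) -> __m256 {
    _mm256_mask_loadu_ps(_mm256_set1_ps(f32::NAN), k, p)
}
```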
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
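+///
+/// # Example
+///
+/// A minimal sketch, assuming `avx512f` is available at runtime; the `Aligned`
+/// wrapper is a helper defined only for this example, to satisfy the 64-byte
+/// alignment requirement:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(64))]
+/// struct Aligned([i32; 16]);
+///
+/// unsafe {
+///     let mem = Aligned([7; 16]);
+///     let src = _mm512_set1_epi32(-1);
+///     // The low 8 mask bits are set: elements 0..8 come from `mem`,
+///     // elements 8..16 are copied from `src`.
+///     let r = _mm512_mask_load_epi32(src, 0x00FF, mem.0.as_ptr());
+/// }
+/// ```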
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
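+///
+/// # Example
+///
+/// An illustrative sketch (assumes runtime `avx512f`/`avx512vl` support); the
+/// `Aligned` wrapper exists only to provide the required 32-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(32))]
+/// struct Aligned([i32; 8]);
+///
+/// unsafe {
+///     let mem = Aligned([3; 8]);
+///     let src = _mm256_set1_epi32(0);
+///     // Only the low four elements are loaded; the rest stay 0 (from `src`).
+///     let r = _mm256_mask_load_epi32(src, 0b0000_1111, mem.0.as_ptr());
+/// }
+/// ```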
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
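+///
+/// # Example
+///
+/// A short sketch (assumes runtime `avx512f`/`avx512vl` support); `Aligned` is
+/// defined here only to guarantee the 16-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned([i32; 4]);
+///
+/// unsafe {
+///     let mem = Aligned([5; 4]);
+///     let src = _mm_set1_epi32(-1);
+///     // Mask 0b0101 loads elements 0 and 2; elements 1 and 3 come from `src`.
+///     let r = _mm_mask_load_epi32(src, 0b0101, mem.0.as_ptr());
+/// }
+/// ```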
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
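+///
+/// # Example
+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0_i32; 16];
+///     let a = _mm512_set1_epi32(9);
+///     // Only the even-indexed lanes are written; the odd lanes keep their
+///     // previous contents (here, 0).
+///     _mm512_mask_storeu_epi32(out.as_mut_ptr(), 0b0101_0101_0101_0101, a);
+/// }
+/// ```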
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
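+///
+/// # Example
+///
+/// An illustrative sketch (assumes runtime `avx512f`/`avx512vl` support):
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0_i32; 4];
+///     let a = _mm_set1_epi32(5);
+///     // Mask 0b0011 writes lanes 0 and 1 only; lanes 2 and 3 are untouched.
+///     _mm_mask_storeu_epi32(out.as_mut_ptr(), 0b0011, a);
+/// }
+/// ```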
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vrndscaless(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vrndscaless(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vrndscalesd(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
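// Editor's note: illustrative only, not part of this diff. The aligned variants above
// (vmovaps/vmovapd/vmovdqa32/vmovdqa64 under a k mask) fault unless mem_addr meets the
// full vector alignment stated in their docs, so a caller typically guarantees it up
// front; the Aligned64 wrapper below is an assumed helper, not something this patch adds.
#[cfg(target_arch = "x86_64")]
#[repr(align(64))]
struct Aligned64([f32; 16]);

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn aligned_masked_store_demo() {
    use std::arch::x86_64::*;
    let mut out = Aligned64([0.0; 16]);
    let v = _mm512_set1_ps(1.5);
    // Write the even lanes only; odd lanes keep their previous contents (0.0 here).
    _mm512_mask_store_ps(out.0.as_mut_ptr(), 0b0101_0101_0101_0101, v);
    assert_eq!(out.0[0], 1.5);
    assert_eq!(out.0[1], 0.0);
}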
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vscalefss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = simd_extract(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(a, 0, fmadd); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(a, 0, fmadd); +pub unsafe fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
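// Editor's note: a small argument-order check, illustrative and not part of this diff.
// _mm512_setr_pd takes its arguments in memory (lowest-element-first) order, while
// _mm512_set_pd takes them highest-element-first and is implemented above by reversing
// into _mm512_setr_pd, so the two calls below build the same vector.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn set_vs_setr_pd() {
    use std::arch::x86_64::*;
    let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
    let (mut out_a, mut out_b) = ([0.0f64; 8], [0.0f64; 8]);
    _mm512_storeu_pd(out_a.as_mut_ptr(), a);
    _mm512_storeu_pd(out_b.as_mut_ptr(), b);
    assert_eq!(out_a, out_b);
}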
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmadd: f32 = simd_extract(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(c, 0, fmadd); - transmute(r) +pub unsafe fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut mov: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = 0.; +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut mov: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmadd: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut mov: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(c, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
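// Editor's note: illustrative sketch, not part of this diff, of the scalar masked-move
// semantics implemented above with simd_extract/simd_insert: when mask bit 0 is set,
// lane 0 comes from b; when it is clear, lane 0 comes from src (writemask) or is zeroed
// (zeromask), and the upper lanes are always copied from a. The demo name is assumed.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_move_demo() {
    use std::arch::x86_64::*;
    let src = _mm_set_ps(0.0, 0.0, 0.0, 9.0); // lane 0 = 9.0
    let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
    let b = _mm_set_ps(0.0, 0.0, 0.0, 7.0); // lane 0 = 7.0
    let taken = _mm_mask_move_ss(src, 0b1, a, b); // mask bit set: lane 0 = 7.0
    let kept = _mm_mask_move_ss(src, 0b0, a, b); // mask bit clear: lane 0 = 9.0
    assert_eq!(_mm_cvtss_f32(taken), 7.0);
    assert_eq!(_mm_cvtss_f32(kept), 9.0);
}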
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut mov: f64 = 0.; if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = 0.; +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmsub: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(c, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = 0.; +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmsub: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(c, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
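// Editor's note: illustrative only, not part of this diff. The masked scalar add/sub
// forms above compute lane 0 in plain Rust and fall back to src (writemask) or 0.0
// (zeromask) when mask bit 0 is clear; a quick zeromask example:
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_scalar_add_demo() {
    use std::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    let sum = _mm_maskz_add_ss(0b1, a, b); // mask bit set: lane 0 = 1.0 + 2.0
    let zeroed = _mm_maskz_add_ss(0b0, a, b); // mask bit clear: lane 0 zeroed
    assert_eq!(_mm_cvtss_f32(sum), 3.0);
    assert_eq!(_mm_cvtss_f32(zeroed), 0.0);
}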
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta = -fnmadd; + let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta - extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = 0.; +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmadd: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(c, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = 0.; +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmadd: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + add = extracta * extractb; } - let r = simd_insert(c, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = 0.; +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmsub: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(c, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta = -fnmsub; + let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
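// Editor's note: illustrative sketch, not part of this diff, of the writemask form of
// the masked scalar divide above: when mask bit 0 is clear, lane 0 of the result is
// copied from src instead of computing a / b. The demo name is an assumption.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_scalar_div_demo() {
    use std::arch::x86_64::*;
    let src = _mm_set_sd(5.0);
    let a = _mm_set_sd(8.0);
    let b = _mm_set_sd(2.0);
    let quotient = _mm_mask_div_sd(src, 0b1, a, b); // mask bit set: lane 0 = 4.0
    let kept = _mm_mask_div_sd(src, 0b0, a, b); // mask bit clear: lane 0 = 5.0 (from src)
    assert_eq!(_mm_cvtsd_f64(quotient), 4.0);
    assert_eq!(_mm_cvtsd_f64(kept), 5.0);
}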
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = 0.; +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmsub: f64 = simd_extract(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(c, 0, fnmsub); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of 
dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vaddss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
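A minimal usage sketch for the zero-masked scalar divide added above (`_mm_maskz_div_sd`). This is not part of the diff; the demo function name is invented, and the code assumes the intrinsic is reachable through `core::arch::x86_64` on an AVX512F-capable CPU (a nightly feature gate may still be required depending on the toolchain).

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn maskz_div_sd_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_pd(10.0, 8.0); // lane 1 = 10.0, lane 0 = 8.0
    let b = _mm_set_pd(99.0, 2.0); // lane 0 = 2.0
    // Mask bit 0 set: lane 0 = 8.0 / 2.0 = 4.0, lane 1 is copied from a.
    let r = _mm_maskz_div_sd(0b1, a, b);
    assert_eq!(_mm_cvtsd_f64(r), 4.0);
    // Mask bit 0 clear: lane 0 is zeroed instead of computed.
    let z = _mm_maskz_div_sd(0b0, a, b);
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```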
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vaddss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vaddss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
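To illustrate the writemask/zeromask behaviour of the scalar max intrinsics just introduced (`_mm_mask_max_ss`, `_mm_maskz_max_sd`), here is a hypothetical sketch, separate from the diff; it only relies on the signatures shown above.

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_max_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lane 0 = 1.0
    let b = _mm_set_ps(0.0, 0.0, 0.0, 5.0); // lane 0 = 5.0
    let src = _mm_set1_ps(-1.0);
    // Writemask set: lane 0 = max(1.0, 5.0); upper lanes come from a.
    let r = _mm_mask_max_ss(src, 0b1, a, b);
    assert_eq!(_mm_cvtss_f32(r), 5.0);
    // Writemask clear: lane 0 falls back to src's lane 0 (-1.0).
    let f = _mm_mask_max_ss(src, 0b0, a, b);
    assert_eq!(_mm_cvtss_f32(f), -1.0);
    // Zeromask clear: the double-precision variant zeroes lane 0 instead.
    let z = _mm_maskz_max_sd(0b0, _mm_set1_pd(2.0), _mm_set1_pd(3.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```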
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vaddsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vaddsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vaddsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsubss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
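The masked scalar min intrinsics follow the same pattern; a short sketch (not part of the diff, demo name invented) under the same AVX512F assumptions:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_min_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(7.0);   // lane 0 = 7.0, upper lanes 0.0
    let b = _mm_set_ss(3.0);   // lane 0 = 3.0
    let src = _mm_set_ss(100.0);
    // Writemask set: lane 0 = min(7.0, 3.0) = 3.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_min_ss(src, 0b1, a, b)), 3.0);
    // Writemask clear: lane 0 is taken from src (100.0).
    assert_eq!(_mm_cvtss_f32(_mm_mask_min_ss(src, 0b0, a, b)), 100.0);
    // Double-precision zeromask variant: lane 0 zeroed when bit 0 is clear.
    let z = _mm_maskz_min_sd(0b0, _mm_set_sd(7.0), _mm_set_sd(3.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```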
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsubss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsubss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsubsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsubsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
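A hedged sketch for the masked scalar square-root intrinsics above; the square root is taken from `b`'s lane 0 while the upper lanes come from `a`, as the doc text states. Not part of the diff; exact inputs are chosen so the result is independent of rounding.

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_sqrt_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(2.0);   // upper lanes of the result come from a
    let b = _mm_set_ss(9.0);   // the square root is taken from b's lane 0
    let src = _mm_set_ss(-1.0);
    // Writemask set: lane 0 = sqrt(9.0) = 3.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_sqrt_ss(src, 0b1, a, b)), 3.0);
    // Writemask clear: lane 0 is copied from src.
    assert_eq!(_mm_cvtss_f32(_mm_mask_sqrt_ss(src, 0b0, a, b)), -1.0);
    // Zeromask clear: the sd variant zeroes lane 0.
    let z = _mm_maskz_sqrt_sd(0b0, _mm_set_sd(2.0), _mm_set_sd(9.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```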
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsubsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmulss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmulss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmulss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmulsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmulsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
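Since `rsqrt14` is an approximation (relative error below 2^-14 per the doc text), a sketch exercising it should compare against a tolerance rather than an exact value. Hypothetical code, not part of the diff:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn rsqrt14_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(4.0); // 1/sqrt(4.0) = 0.5
    let r = _mm_cvtss_f32(_mm_rsqrt14_ss(a, b));
    // The result is an approximation with relative error below 2^-14.
    assert!((r - 0.5).abs() <= 0.5 * (1.0 / 16384.0));
    // Masked-off lane 0 falls back to src for the writemask variant...
    let src = _mm_set_ss(42.0);
    assert_eq!(_mm_cvtss_f32(_mm_mask_rsqrt14_ss(src, 0b0, a, b)), 42.0);
    // ...and to 0.0 for the zeromask variant.
    assert_eq!(_mm_cvtss_f32(_mm_maskz_rsqrt14_ss(0b0, a, b)), 0.0);
}
```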
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmulsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vdivss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vdivss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vdivss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vdivsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vdivsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vdivsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
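The `rcp14` family mirrors `rsqrt14`: an approximate reciprocal of `b`'s lane 0 with the same 2^-14 error bound. A hypothetical sketch, not part of the diff:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn rcp14_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_sd(0.0);
    let b = _mm_set_sd(8.0); // 1/8.0 = 0.125
    let r = _mm_cvtsd_f64(_mm_rcp14_sd(a, b));
    // Approximate reciprocal: relative error below 2^-14.
    assert!((r - 0.125).abs() <= 0.125 * (1.0 / 16384.0));
    // Writemask clear: lane 0 comes from src; zeromask clear: lane 0 is 0.0.
    let src = _mm_set_sd(9.0);
    assert_eq!(_mm_cvtsd_f64(_mm_mask_rcp14_sd(src, 0b0, a, b)), 9.0);
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_rcp14_sd(0b0, a, b)), 0.0);
}
```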
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmaxss(a, b, zero, 0b1, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmaxss(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmaxss(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmaxsd(a, b, zero, 0b1, SAE); - transmute(r) -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmaxsd(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmaxsd(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub unsafe fn _mm_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128, + b: __m128, +) -> __m128 { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vminss(a, b, zero, 0b1, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_ss( +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub unsafe fn _mm_mask_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vminss(a, b, src, k, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub unsafe fn _mm_maskz_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vminss(a, b, zero, k, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub unsafe fn _mm_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vminsd(a, b, zero, 0b1, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub unsafe fn _mm_mask_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - let r = vminsd(a, b, src, k, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
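A sketch of how the getmant intrinsics above pair with getexp to split a value into mantissa and exponent (illustrative only, not part of the patch; it assumes the `_MM_MANT_NORM_1_2`/`_MM_MANT_SIGN_SRC` constants as they are named in this module and an x86_64 target with AVX-512F available):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn getmant_demo() {
    let a = _mm_set1_ps(0.0); // upper three lanes of the result come from `a`
    let b = _mm_set_ps(0.0, 0.0, 0.0, 24.0); // lower lane: 24.0 = 1.5 * 2^4

    // Normalize b[0] into [1, 2) and keep the source sign: 24.0 -> 1.5.
    let mant = _mm_cvtss_f32(_mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b));
    assert_eq!(mant, 1.5);

    // getexp gives the matching power of two, so the original value is mant * 2^exp.
    let exp = _mm_cvtss_f32(_mm_getexp_ss(a, b));
    assert_eq!(mant * 2f32.powi(exp as i32), 24.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { getmant_demo() };
    }
}
```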
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub unsafe fn _mm_maskz_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vminsd(a, b, zero, k, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); +pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + let r = vrndscaless(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the 
rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +pub unsafe fn _mm_mask_roundscale_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vsqrtss(a, b, src, k, ROUNDING); + let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss( +pub unsafe fn _mm_maskz_roundscale_ss( k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, k, ROUNDING); + let r = vrndscaless(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); +pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // 
round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +pub unsafe fn _mm_mask_roundscale_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - let r = vsqrtsd(a, b, src, k, ROUNDING); + let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) #[inline] #[target_feature(enable = "avx512f")] 
-#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +pub unsafe fn _mm_maskz_roundscale_sd( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, k, ROUNDING); + let r = vrndscalesd(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetexpss(a, b, zero, 0b1, SAE); - transmute(r) + transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION)) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
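The roundscale immediate is easy to get wrong, so here is a small sketch (illustrative, not from the patch): per Intel's definition of vrndscaless, `IMM8[7:4]` is the number of fraction bits to keep and `IMM8[2:0]` selects the rounding mode, with 0 meaning round-to-nearest.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_demo() {
    let a = _mm_set1_ps(0.0); // upper three lanes pass through from `a`
    let b = _mm_set_ss(2.6);

    // IMM8 = 0x00: keep 0 fraction bits, round to nearest -> 3.0.
    let r = _mm_roundscale_ss::<0x00>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 3.0);

    // IMM8 = 0x20: keep 2 fraction bits -> round(2.6 * 4) / 4 = 2.5.
    let r = _mm_roundscale_ss::<0x20>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 2.5);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { roundscale_demo() };
    }
}
```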
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vgetexpss(a, b, src, k, SAE); - transmute(r) + transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetexpss(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetexpsd(a, b, zero, 0b1, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetexpsd(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
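As a quick illustration of the scalef semantics described above, where the low lane becomes `a[0] * 2^floor(b[0])` (sketch only, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scalef_demo() {
    let a = _mm_set_sd(3.0);
    let b = _mm_set_sd(2.9); // the exponent argument is floored: 2^2

    // 3.0 * 2^floor(2.9) = 3.0 * 4.0 = 12.0
    let r = _mm_scalef_sd(a, b);
    assert_eq!(_mm_cvtsd_f64(r), 12.0);

    // Writemask variant: with mask bit 0 clear, the low lane comes from `src`.
    let src = _mm_set_sd(-7.0);
    let r = _mm_mask_scalef_sd(src, 0b0, a, b);
    assert_eq!(_mm_cvtsd_f64(r), -7.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { scalef_demo() };
    }
}
```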
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
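The three mask flavours above differ only in where the low lane falls back to when mask bit 0 is clear: a for `_mm_mask_*`, zero for `_mm_maskz_*`, and c for `_mm_mask3_*`. A short sketch, not part of the patch, with made-up values:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_mask_demo() {
    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(4.0));

    // Mask bit set: low lane = a * b + c = 10.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b1, b, c)), 10.0);
    // Mask bit clear: _mm_mask_* keeps a[0] ...
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b0, b, c)), 2.0);
    // ... _mm_maskz_* zeroes the lane ...
    assert_eq!(_mm_cvtss_f32(_mm_maskz_fmadd_ss(0b0, a, b, c)), 0.0);
    // ... and _mm_mask3_* keeps c[0] (its upper lanes also come from c).
    assert_eq!(_mm_cvtss_f32(_mm_mask3_fmadd_ss(a, b, c, 0b0)), 4.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmadd_mask_demo() };
    }
}
```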
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmsub: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmsub: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc = -fmsub; + fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmsub: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
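The same pattern applies to the fused multiply-subtract forms, which compute `a * b - c` in the low lane (illustrative sketch, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmsub_mask_demo() {
    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(5.0), _mm_set_ss(4.0));

    // Mask bit set: low lane = 2.0 * 5.0 - 4.0 = 6.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmsub_ss(a, 0b1, b, c)), 6.0);
    // Mask bit clear: the fallback lane depends on the variant.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmsub_ss(a, 0b0, b, c)), 2.0); // from a
    assert_eq!(_mm_cvtss_f32(_mm_maskz_fmsub_ss(0b0, a, b, c)), 0.0); // zeroed
    assert_eq!(_mm_cvtss_f32(_mm_mask3_fmsub_ss(a, b, c, 0b0)), 4.0); // from c
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmsub_mask_demo() };
    }
}
```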
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmsub: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc = -fmsub; + fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fnmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fnmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fnmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
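One more sketch (not from the patch) pinning down the sign convention: the fnmadd forms negate the product before adding c, so the low lane is `-(a[0] * b[0]) + c[0]`.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fnmadd_demo() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(10.0));

    // Low lane: -(2.0 * 3.0) + 10.0 = 4.0 when mask bit 0 is set.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b1, b, c)), 4.0);
    // With the bit clear the lane falls back to a[0].
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b0, b, c)), 2.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fnmadd_demo() };
    }
}
```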
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fnmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fnmsub: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fnmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fnmsub: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc = -fnmsub; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fnmsub: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
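And the fnmsub forms negate the product and then subtract c, so the low lane is `-(a[0] * b[0]) - c[0]` (illustrative sketch, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fnmsub_demo() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(1.0));

    // Low lane: -(2.0 * 3.0) - 1.0 = -7.0 when mask bit 0 is set.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 0b1, b, c)), -7.0);
    // Zeromask variant: the lane is zeroed when the bit is clear.
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_fnmsub_sd(0b0, a, b, c)), 0.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fnmsub_demo() };
    }
}
```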
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fnmsub: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc = -fnmsub; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmsub); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vaddss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of 
dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vaddss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vaddsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = 
a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vaddsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsubss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // 
truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsubss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsubsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store 
the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsubsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] 
+#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vmulss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vmulss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round 
down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vmulsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vmulsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision 
(32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vdivss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see 
_MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vdivss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vdivsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 
is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_div_round_sd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_rounding!(ROUNDING);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vdivsd(a, b, zero, k, ROUNDING);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    let r = vmaxss(a, b, zero, 0b1, SAE);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_ss<const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
+    let r = vmaxss(a, b, src, k, SAE);
+    transmute(r)
+}
+
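For every *_round_* intrinsic in this stretch, the rounding or SAE mode is a compile-time constant (hence the #[rustc_legacy_const_generics] attributes and the static_assert_rounding!/static_assert_sae! checks). A hedged usage sketch for the max forms defined above, assuming their <const SAE: i32> parameter and passing it via turbofish (illustrative only):

// Illustrative sketch, not part of the patch: SAE must be either
// _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC (suppress exceptions).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn max_round_ss_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(4.0);
    // Unmasked form: lower lane is max(1.0, 4.0) = 4.0.
    let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 4.0);
    // Writemask form with mask bit 0 clear: lower lane falls back to `src`.
    let src = _mm_set_ss(-1.0);
    let r = _mm_mask_max_round_ss::<_MM_FROUND_NO_EXC>(src, 0b0, a, b);
    assert_eq!(_mm_cvtss_f32(r), -1.0);
}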
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    let r = vmaxss(a, b, zero, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vmaxsd(a, b, zero, 0b1, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_sd<const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
+    let r = vmaxsd(a, b, src, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_sd<const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vmaxsd(a, b, zero, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vminss(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vminss(a, b, src, k, SAE); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vminss(a, b, zero, k, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vminsd(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vminsd(a, b, src, k, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vminsd(a, b, zero, k, SAE); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsqrtss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsqrtss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be 
one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsqrtsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsqrtsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
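Since "essentially calculates floor(log2(x))" is easy to gloss over, a short numeric sketch may help. It uses the unmasked _mm_getexp_ss companion from the same AVX-512F family (an assumption here, since that intrinsic is defined elsewhere in this file) so that the SAE parameter does not come into play (illustrative only):

// Illustrative sketch, not part of the patch: getexp returns the exponent of
// the lower lane as a floating-point number, i.e. floor(log2(|x|)).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn getexp_ss_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0); // upper lanes of the result come from `a`
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(8.0))), 3.0); // floor(log2(8)) = 3
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(10.0))), 3.0); // floor(log2(10)) = 3
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(0.25))), -2.0); // floor(log2(0.25)) = -2
}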
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vgetexpss(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetexpss(a, b, src, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vgetexpss(a, b, zero, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vgetexpsd(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetexpsd(a, b, src, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); @@ -41325,3266 +42621,4231 @@ mod tests { let src = _mm_set1_epi32(0); let r = _mm_mask_cvtps_epu32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); + let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvtps_epu32(0, a); + unsafe fn test_mm_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_cvtepi16_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); + let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi8_epi32(a); + unsafe fn test_mm512_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu16_epi32(a); let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + let r = _mm512_mask_cvtepu16_epi32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); let e = 
_mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm512_maskz_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu16_epi32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm256_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + let r = _mm256_mask_cvtepu16_epi32(src, 0, a); assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm256_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi32(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi8_epi32(src, 0, a); + let r = _mm_mask_cvtepu16_epi32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu8_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_ps(a); + let 
e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu8_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepi32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu8_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_maskz_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + unsafe fn test_mm256_mask_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm256_set1_ps(-1.); + let r = _mm256_mask_cvtepi32_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set1_ps(-1.); + let r = _mm_mask_cvtepi32_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtepi32_ps(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepu32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtepi32_epi16(src, 0, a); assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu8_epi32(0, a); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu8_epi32(src, 0, a); + unsafe fn test_mm256_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu8_epi32(0, a); + unsafe fn test_mm_maskz_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn 
test_mm512_maskz_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); + unsafe fn test_mm256_mask_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi16_epi32(src, 0, a); + unsafe fn test_mm256_maskz_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_cvtepi16_epi32(0, a); + unsafe fn test_mm_maskz_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } 
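Before the saturating conversion tests that follow, a minimal sketch of how the three down-conversion families handle an out-of-range lane; the input value is an assumption chosen for illustration, and the assert helpers are the ones this test module already uses:

let a = _mm_set_epi32(0, 0, 0, i32::MAX);
// plain truncation keeps the low byte: 0x7FFFFFFF -> 0xFF == -1_i8
assert_eq_m128i(_mm_cvtepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1));
// signed saturation clamps to the i8 range
assert_eq_m128i(_mm_cvtsepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX));
// unsigned saturation clamps to the u8 range (255, i.e. -1 when read back as i8)
assert_eq_m128i(_mm_cvtusepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8));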
#[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi16(a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_maskz_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn 
test_mm256_mask_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu16_epi32(src, 0, a); + unsafe fn test_mm_mask_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu16_epi32(0, a); + unsafe fn test_mm_maskz_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtsepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let r = _mm512_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepi32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_mask_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = 
_mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_maskz_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let r = _mm512_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm256_set1_ps(-1.); - let r = _mm256_mask_cvtepi32_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); + unsafe fn test_mm256_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtepi32_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); + unsafe fn test_mm256_mask_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let src = _mm_set1_ps(-1.); - let r = _mm_mask_cvtepi32_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); + unsafe fn test_mm256_maskz_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_cvtepi32_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtepi32_ps(0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - 
} - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepu32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtepi32_epi16(src, 0, a); + let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtusepi32_epi16(src, 
0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm512_maskz_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtusepi32_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + unsafe fn test_mm256_mask_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm256_maskz_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtusepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + unsafe fn test_mm_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_cvtusepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); + unsafe fn test_mm_mask_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtepi32_epi16(src, 0, a); + let r = _mm_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); + let e = 
_mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm_maskz_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_maskz_cvtusepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi8(0, a); + unsafe fn test_mm512_maskz_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_mask_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); let src = 
_mm_set1_epi8(0); - let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi8(0, a); + unsafe fn test_mm256_maskz_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_maskz_cvtusepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = 
_mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); - let e = 
_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, ); - let r = _mm512_cvtsepi32_epi16(a); - #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, ); - assert_eq_m256i(r, e); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi16() { + unsafe fn test_mm512_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 8., 10., 10., 12., + 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); #[rustfmt::skip] - let e = _mm256_set_epi16( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., ); - assert_eq_m256i(r, e); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = 
_mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, ); - let r = _mm512_maskz_cvtsepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtsepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_mask_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + unsafe fn test_mm256_maskz_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let r = 
_mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); + unsafe fn test_mm_mask_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtsepi32_epi16(src, 0, a); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtsepi32_epi16(0, a); + unsafe fn test_mm_maskz_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, + unsafe fn test_mm512_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); - assert_eq_m128i(r, e); + unsafe fn test_mm512_mask_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); + unsafe fn 
test_mm512_maskz_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + unsafe fn test_mm256_maskz_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + unsafe fn test_mm_mask_cvtps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_maskz_cvtsepi32_epi8(0, a); + unsafe fn test_mm_maskz_cvtps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = 
_mm_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - let r = _mm512_cvtusepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + let r = _mm512_cvtph_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_mask_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvtph_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_maskz_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 
4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - let r = _mm512_maskz_cvtusepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + let r = _mm512_maskz_cvtph_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_cvtusepi32_epi16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm256_set1_ps(0.); + let r = _mm256_mask_cvtph_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm256_maskz_cvtph_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtph_ps(0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtph_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtph_ps(src, 0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_cvtusepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm_maskz_cvtph_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtph_ps(0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epi32() { + 
let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_mask_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, 
src); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn 
test_mm_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_maskz_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + unsafe fn test_mm_mask_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epi32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_maskz_cvtusepi32_epi8(0, a); + unsafe fn test_mm_maskz_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + let r = _mm_maskz_cvttps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epi32() { + unsafe fn test_mm512_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvttps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epi32() { + unsafe fn test_mm512_mask_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); + let r = _mm512_mask_cvttps_epu32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + unsafe fn test_mm512_maskz_cvttps_epu32() { let a = _mm512_setr_ps( 0., 
-1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); + let r = _mm512_maskz_cvttps_epu32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvttps_epu32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvttps_epu32(a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = 
_mm512_i32gather_ps::<4>(index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, + unsafe fn test_mm512_mask_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + let index = _mm512_setr_epi32( + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, ); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr() as *const u8); + assert_eq_m512i( + r, + _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = [0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., 
-100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + unsafe fn test_mm512_mask_cmplt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 8., 10., 10., 12., - 12., 14., 14., 16., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); + unsafe fn test_mm512_cmpnlt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); } #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm512_mask_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001101_11001101); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + 
unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001000_01001000); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, 0b00110010_00110010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 
0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + unsafe fn test_mm256_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let mask = 0b01100110; + let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + unsafe fn test_mm_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); + unsafe fn test_mm_mask_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let mask = 0b11111111; + let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = 
_mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvtph_ps(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvtph_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b11000011_11000011; + let m = _mm512_mask_cmpord_ps_mask(mask, a, b); + assert_eq!(m, 0b00000001_00000001); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvtph_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm256_set1_ps(0.); - let r = _mm256_mask_cvtph_ps(src, 0, a); - assert_eq_m256(r, src); - let r = 
_mm256_mask_cvtph_ps(src, 0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm256_maskz_cvtph_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtph_ps(0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm_set1_ps(0.); - let r = _mm_mask_cvtph_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtph_ps(src, 0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } + unsafe fn test_mm512_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpunord_ps_mask(a, b); - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm_maskz_cvtph_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtph_ps(0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); + assert_eq!(m, 0b11111010_11111010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b00001111_00001111; + let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); + assert_eq!(m, 0b000001010_00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = 
_mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epi32(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = 
_mm512_mask_cvttps_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b10000000); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epi32(0, a); - assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm_maskz_cvttps_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b10000000); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epu32(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b00001000); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvttps_epu32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001000); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epu32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_cvttps_epu32(a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epu32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + 
assert_eq!(r, 0b01001010_01001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epu32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epu32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00111111); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_cvttps_epu32(a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00111111); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epu32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epu32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvttps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm_mask_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_ps() { - let mut arr = [0f32; 256]; - for i in 0..256 { - arr[i] = i as f32; - } - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr() as *const u8); + unsafe fn test_mm512_cmple_epu32_mask() { #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., - 120., 128., 136., 144., 152., 160., 168., 176.)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epu32_mask(a, b), + !_mm512_cmpgt_epu32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_ps() { - let mut arr = [0f32; 256]; - for i in 0..256 { - arr[i] = i as f32; - } - let src = _mm512_set1_ps(2.); - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8); + unsafe fn 
test_mm512_mask_cmple_epu32_mask() { #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., - 2., 128., 2., 144., 2., 160., 2., 176.)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + 0b01111010_01111010 + ); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_epi32() { - let mut arr = [0i32; 256]; - for i in 0..256 { - arr[i] = i as i32; - } - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr() as *const u8); - #[rustfmt::skip] - assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmple_epu32_mask(a, b); + assert_eq!(r, 0b11000000) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_epi32() { - let mut arr = [0i32; 256]; - for i in 0..256 { - arr[i] = i as i32; - } - let src = _mm512_set1_epi32(2); - let mask = 0b10101010_10101010; - let index = _mm512_setr_epi32( - 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, - ); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr() as *const u8); - assert_eq_m512i( - r, - _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), - ); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b11000000) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_ps() { - let mut arr = [0f32; 256]; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src); - let mut expected = [0f32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmple_epu32_mask(a, b); + assert_eq!(r, 0b00001100) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_ps() { - let mut arr = [0f32; 256]; - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); - let mut expected = [0f32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2. 
* (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001100) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_epi32() { - let mut arr = [0i32; 256]; + unsafe fn test_mm512_cmpge_epu32_mask() { #[rustfmt::skip] - - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src); - let mut expected = [0i32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epu32_mask(a, b), + !_mm512_cmplt_epu32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_epi32() { - let mut arr = [0i32; 256]; - let mask = 0b10101010_10101010; + unsafe fn test_mm512_mask_cmpge_epu32_mask() { #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); - let mut expected = [0i32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2 * (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b00110000_00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b01111111) + } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmplt_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b00000111) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, 
f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_ps_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000111) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnlt_ps_mask() { + unsafe fn test_mm512_cmpeq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmpnle_ps_mask(b, a); - assert_eq!(m, 0b00001101_00001101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, 
f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b00001100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00001100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_ps_mask() { + unsafe fn test_mm512_cmpneq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpeq_ps_mask(b, a); - assert_eq!(m, 0b11001101_11001101); + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_ps_mask() { + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); - assert_eq!(r, 0b01001000_01001000); + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let r = _mm256_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 
0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + unsafe fn test_mm512_cmp_epu32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpneq_ps_mask(b, a); - assert_eq!(m, 0b00110010_00110010); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + unsafe fn test_mm512_mask_cmp_epu32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) + let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let m = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001000); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_ps_mask() { + unsafe fn test_mm512_cmplt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 
0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_ps_mask() { + unsafe fn test_mm512_mask_cmplt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00000101); + unsafe fn test_mm256_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let mask = 0b01100110; - let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00000100); + unsafe fn test_mm256_mask_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); - let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00001010); + unsafe fn test_mm_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let r = _mm_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); + unsafe fn test_mm_mask_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); let mask = 0b11111111; - let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00001010); + let r = _mm_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_round_ps_mask() { + unsafe fn test_mm512_cmpgt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = 
_mm512_cmpgt_epi32_mask(b, a); assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_round_ps_mask() { + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); assert_eq!(r, 0b00000100_00000100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpord_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_cmple_epi32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b11000011_11000011; - let m = _mm512_mask_cmpord_ps_mask(mask, a, b); - assert_eq!(m, 0b00000001_00000001); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epi32_mask(a, b), + !_mm512_cmpgt_epi32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_mask_cmple_epi32_mask() { #[rustfmt::skip] - let b = 
_mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpunord_ps_mask(a, b); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b00110000_00110000); + } - assert_eq!(m, 0b11111010_11111010); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let r = _mm_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000010) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_cmpge_epi32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b00001111_00001111; - let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); - assert_eq!(m, 0b000001010_00001010); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epi32_mask(a, b), + !_mm512_cmplt_epi32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + 0b01111010_01111010 + ); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = 
_mm256_set1_epi32(-1); + let r = _mm256_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001111) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 
0b01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b00001100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00001100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epu32_mask() { + unsafe fn test_mm512_cmpneq_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epu32_mask(a, b); - assert_eq!(m, 0b11001111_11001111); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epu32_mask() { + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b01001010_01001010); + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b10000000); + unsafe fn test_mm256_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); + unsafe fn test_mm256_mask_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + let b = 
_mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b11111111; - let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b10000000); + let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110011) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b00001000); + unsafe fn test_mm_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epi32_mask(b, a); + assert_eq!(r, 0b00000011) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); + unsafe fn test_mm_mask_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); let mask = 0b11111111; - let r = _mm_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001000); + let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000011) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epu32_mask() { + unsafe fn test_mm512_cmp_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + unsafe fn test_mm512_mask_cmp_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00111111); + unsafe fn test_mm256_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00111111); + unsafe fn test_mm256_mask_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b01100110; + let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100); } 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + unsafe fn test_mm_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); let b = _mm_set1_epi32(1); - let r = _mm_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00000011); + let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + unsafe fn test_mm_mask_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); let b = _mm_set1_epi32(1); let mask = 0b11111111; - let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000011); + let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epu32_mask(a, b), - !_mm512_cmpgt_epu32_mask(a, b) + unsafe fn test_mm512_set_epi8() { + let r = _mm512_set1_epi8(2); + assert_eq_m512i( + r, + _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + ), ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmple_epu32_mask(mask, a, b), - 0b01111010_01111010 - ); + unsafe fn test_mm512_set_epi16() { + let r = _mm512_set1_epi16(2); + assert_eq_m512i( + r, + _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmple_epu32_mask(a, b); - assert_eq!(r, 0b11000000) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b11000000) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmple_epu32_mask(a, b); - assert_eq!(r, 0b00001100) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi8() { + let r = 
_mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi8(2)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001100) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi16() { + let r = _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi16(2)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epu32_mask(a, b), - !_mm512_cmplt_epu32_mask(a, b) - ) + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b01100000_0110000); + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b01111111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_epi32() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b01111111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b00000111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = 
_mm_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + unsafe fn test_mm512_set4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + unsafe fn test_mm512_set4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b00001100); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00001100); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero() { + assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); } #[simd_test(enable = "avx512f")] 
- unsafe fn test_mm512_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epu32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + unsafe fn test_mm512_loadu_pd() { + let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; + let p = a.as_ptr(); + let r = _mm512_loadu_pd(black_box(p)); + let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010); + unsafe fn test_mm512_storeu_pd() { + let a = _mm512_set1_pd(9.); + let mut r = _mm512_undefined_pd(); + _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); + assert_eq_m512d(r, a); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let r = _mm256_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00110000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_ps() { + let a = &[ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ]; + let p = a.as_ptr(); + let r = _mm512_loadu_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00000011); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); - 
assert_eq!(r, 0b00000011); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111_11001111); + unsafe fn test_mm512_mask_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let src = _mm512_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01001010_01001010); + unsafe fn test_mm512_maskz_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi32() { + let mut r = [42_i32; 16]; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], + } + let mut r = Align { data: [42; 16] }; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); 
+ assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let m = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epi32_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + unsafe fn test_mm512_mask_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let src = _mm512_set1_epi64(42); + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + unsafe fn test_mm512_maskz_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi64() { + let mut r = [42_i64; 8]; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask 
= 0b11111111; - let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + let p = r.data.as_mut_ptr(); + _mm512_mask_store_epi64(p, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let r = _mm_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_ps() { + let src = _mm512_set1_ps(42.0); + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_ps() { + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epi32_mask(b, a); - assert_eq!(m, 0b00000101_00000101); + unsafe fn test_mm512_mask_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let src = _mm512_set1_ps(42.0); + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); + unsafe fn test_mm512_maskz_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = 
_mm512_maskz_load_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b11011010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_ps() { + let mut r = [42_f32; 16]; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b11011010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], + } + let mut r = Align { data: [42.0; 16] }; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b00001101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_pd() { + let src = _mm512_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epi32_mask(a, b), - !_mm512_cmpgt_epi32_mask(a, b) - ) + unsafe fn test_mm512_mask_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let src = _mm512_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = 
a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b01100000_0110000); + unsafe fn test_mm512_mask_storeu_pd() { + let mut r = [42_f64; 8]; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00100101) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00100101) + unsafe fn test_mm256_mask_loadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let r = _mm_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00000010) + unsafe fn test_mm256_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000010) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, 
i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epi32_mask(a, b), - !_mm512_cmplt_epi32_mask(a, b) - ) + unsafe fn test_mm256_mask_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let src = _mm256_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmpge_epi32_mask(mask, a, b), - 0b01111010_01111010 - ); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b11111010) + unsafe fn test_mm256_mask_storeu_epi32() { + let mut r = [42_i32; 8]; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b11111010) + unsafe fn test_mm256_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b00001111) + unsafe fn test_mm256_mask_loadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001111) + unsafe fn test_mm256_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = 
a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let src = _mm256_set1_epi64x(42); + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111); + unsafe fn test_mm256_mask_storeu_epi64() { + let mut r = [42_i64; 4]; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010); + unsafe fn test_mm256_mask_store_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], + } + let mut r = Align { data: [42; 4] }; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b00001100); + unsafe fn test_mm256_mask_loadu_ps() { + let src = _mm256_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 
0b11001010; + let r = _mm256_mask_loadu_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00001100); + unsafe fn test_mm256_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let src = _mm256_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); + unsafe fn test_mm256_mask_storeu_ps() { + let mut r = [42_f32; 8]; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110011) + unsafe fn 
test_mm256_mask_store_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epi32_mask(b, a); - assert_eq!(r, 0b00000011) + unsafe fn test_mm256_mask_loadu_pd() { + let src = _mm256_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000011) + unsafe fn test_mm256_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let src = _mm256_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101); + unsafe fn test_mm256_mask_storeu_pd() { + let mut r = [42_f64; 4]; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - 
unsafe fn test_mm256_mask_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b01100110; - let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100); + unsafe fn test_mm256_mask_store_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001010); + unsafe fn test_mm_mask_loadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001010); + unsafe fn test_mm_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi8() { - let r = _mm512_set1_epi8(2); - assert_eq_m512i( - r, - _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let src = _mm_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi16() { - let r = _mm512_set1_epi16(2); - assert_eq_m512i( - r, - _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi32() { - let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi32() { + let mut r = [42_i32; 4]; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); +
assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_epi32() { - let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let mut r = Align { data: [42; 4] }; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi8() { - let r = _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi8(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi16() { - let r = _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi16(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi32() { - let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, _mm512_set1_epi32(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let src = _mm_set1_epi64x(42); + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_si512() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_epi32() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi64() { + let mut r = [42_i64; 2]; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_ps() { - let r = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - 
_mm512_set_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let mut r = Align { data: [42; 2] }; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_ps() { - let r = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_ps() { + let src = _mm_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_ps() { - #[rustfmt::skip] - let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2.); - assert_eq_m512(expected, _mm512_set1_ps(2.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let src = _mm_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_ps() { + let mut r = [42_f32; 4]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); + #[simd_test(enable 
= "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_ps() { - assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_pd() { + let src = _mm_set1_pd(42.0); + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero() { - assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_pd() { - let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; - let p = a.as_ptr(); - let r = _mm512_loadu_pd(black_box(p)); - let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); - assert_eq_m512d(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let src = _mm_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_pd() { - let a = _mm512_set1_pd(9.); - let mut r = _mm512_undefined_pd(); - _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); - assert_eq_m512d(r, a); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_ps() { - let a = &[ - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ]; - let p = a.as_ptr(); - let r = _mm512_loadu_ps(black_box(p)); - let e = _mm512_setr_ps( - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ); - assert_eq_m512(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_pd() { + let mut r = [42_f64; 2]; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_ps() { - let a = _mm512_set1_ps(9.); - let mut r = _mm512_undefined_ps(); - _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); - assert_eq_m512(r, a); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let mut r = Align { data: [42.0; 2] }; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_store_pd(r.data.as_mut_ptr(), 
m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")]