diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 1ad80147cf..9d95f0c492 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1784,113 +1784,113 @@ * [x] [`_mm512_setzero_si512`] * [x] [`_mm512_setzero`] * [x] [`_mm512_load_epi32`] - * [ ] [`_mm512_mask_load_epi32`] //need i1 - * [ ] [`_mm512_maskz_load_epi32`] //need i1 + * [x] [`_mm512_mask_load_epi32`] //need i1 + * [x] [`_mm512_maskz_load_epi32`] //need i1 * [x] [`_mm_load_epi32`] - * [_] [`_mm_mask_load_epi32`] //need i1 - * [_] [`_mm_maskz_load_epi32`] //need i1 + * [x] [`_mm_mask_load_epi32`] //need i1 + * [x] [`_mm_maskz_load_epi32`] //need i1 * [x] [`_mm256_load_epi32`] - * [_] [`_mm256_mask_load_epi32`] //need i1 - * [_] [`_mm256_maskz_load_epi32`] //need i1 + * [x] [`_mm256_mask_load_epi32`] //need i1 + * [x] [`_mm256_maskz_load_epi32`] //need i1 * [x] [`_mm512_load_epi64`] - * [ ] [`_mm512_mask_load_epi64`] //need i1 - * [ ] [`_mm512_maskz_load_epi64`] //need i1 + * [x] [`_mm512_mask_load_epi64`] //need i1 + * [x] [`_mm512_maskz_load_epi64`] //need i1 * [x] [`_mm_load_epi64`] //need i1 - * [_] [`_mm_mask_load_epi64`] //need i1 - * [_] [`_mm_maskz_load_epi64`] //need i1 + * [x] [`_mm_mask_load_epi64`] //need i1 + * [x] [`_mm_maskz_load_epi64`] //need i1 * [x] [`_mm256_load_epi64`] //need i1 - * [_] [`_mm256_mask_load_epi64`] //need i1 - * [_] [`_mm256_maskz_load_epi64`] //need i1 + * [x] [`_mm256_mask_load_epi64`] //need i1 + * [x] [`_mm256_maskz_load_epi64`] //need i1 * [x] [`_mm512_load_ps`] - * [ ] [`_mm512_mask_load_ps`] //need i1 - * [ ] [`_mm512_maskz_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i - * [_] [`_mm_mask_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i1 - * [_] [`_mm256_mask_load_ps`] //need i1 - * [_] [`_mm256_maskz_load_ps`] //need i1 + * [x] [`_mm512_mask_load_ps`] //need i1 + * [x] [`_mm512_maskz_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i + * [x] [`_mm_mask_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i1 + * [x] [`_mm256_mask_load_ps`] //need i1 + * [x] [`_mm256_maskz_load_ps`] //need i1 * [x] [`_mm512_load_pd`] - * [ ] [`_mm512_mask_load_pd`] //need i1 - * [ ] [`_mm512_maskz_load_pd`] //need i1 - * [_] [`_mm_mask_load_pd`] //need i1 - * [_] [`_mm_maskz_load_pd`] //need i1 - * [_] [`_mm256_mask_load_pd`] //need i1 - * [_] [`_mm256_maskz_load_pd`] //need i1 + * [x] [`_mm512_mask_load_pd`] //need i1 + * [x] [`_mm512_maskz_load_pd`] //need i1 + * [x] [`_mm_mask_load_pd`] //need i1 + * [x] [`_mm_maskz_load_pd`] //need i1 + * [x] [`_mm256_mask_load_pd`] //need i1 + * [x] [`_mm256_maskz_load_pd`] //need i1 * [x] [`_mm512_load_si512`] * [x] [`_mm512_loadu_epi32`] - * [ ] [`_mm512_mask_loadu_epi32`] //need i1 + * [x] [`_mm512_mask_loadu_epi32`] //need i1 * [x] [`_mm_loadu_epi32`] - * [_] [`_mm_mask_loadu_epi32`] //need i1 - * [_] [`_mm_maskz_loadu_epi32`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi32`] //need i1 + * [x] [`_mm_mask_loadu_epi32`] //need i1 + * [x] [`_mm_maskz_loadu_epi32`] //need i1 + * [x] [`_mm512_maskz_loadu_epi32`] //need i1 * [x] [`_mm256_loadu_epi32`] - * [_] [`_mm256_mask_loadu_epi32`] //need i1 - * [_] [`_mm256_maskz_loadu_epi32`] //need i1 + * [x] [`_mm256_mask_loadu_epi32`] //need i1 + * [x] [`_mm256_maskz_loadu_epi32`] //need i1 * [x] [`_mm512_loadu_epi64`] - * [ ] [`_mm512_mask_loadu_epi64`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi64`] //need i1 + * [x] [`_mm512_mask_loadu_epi64`] //need i1 + * [x] [`_mm512_maskz_loadu_epi64`] //need i1 * [x] [`_mm_loadu_epi64`] - * [_] 
[`_mm_mask_loadu_epi64`] //need i1 - * [_] [`_mm_maskz_loadu_epi64`] //need i1 + * [x] [`_mm_mask_loadu_epi64`] //need i1 + * [x] [`_mm_maskz_loadu_epi64`] //need i1 * [x] [`_mm256_loadu_epi64`] - * [_] [`_mm256_mask_loadu_epi64`] //need i1 - * [_] [`_mm256_maskz_loadu_epi64`] //need i1 + * [x] [`_mm256_mask_loadu_epi64`] //need i1 + * [x] [`_mm256_maskz_loadu_epi64`] //need i1 * [x] [`_mm512_loadu_ps`] - * [ ] [`_mm512_mask_loadu_ps`] //need i1 - * [ ] [`_mm512_maskz_loadu_ps`] //need i1 - * [_] [`_mm_mask_loadu_ps`] //need i1 - * [_] [`_mm_maskz_loadu_ps`] //need i1 - * [_] [`_mm256_mask_loadu_ps`] //need i1 - * [_] [`_mm256_maskz_loadu_ps`] //need i1 + * [x] [`_mm512_mask_loadu_ps`] //need i1 + * [x] [`_mm512_maskz_loadu_ps`] //need i1 + * [x] [`_mm_mask_loadu_ps`] //need i1 + * [x] [`_mm_maskz_loadu_ps`] //need i1 + * [x] [`_mm256_mask_loadu_ps`] //need i1 + * [x] [`_mm256_maskz_loadu_ps`] //need i1 * [x] [`_mm512_loadu_pd`] - * [ ] [`_mm512_mask_loadu_pd`] //need i1 - * [ ] [`_mm512_maskz_loadu_pd`] //need i1 - * [_] [`_mm_mask_loadu_pd`] //need i1 - * [_] [`_mm_maskz_loadu_pd`] //need i1 - * [_] [`_mm256_mask_loadu_pd`] //need i1 - * [_] [`_mm256_maskz_loadu_pd`] //need i1 + * [x] [`_mm512_mask_loadu_pd`] //need i1 + * [x] [`_mm512_maskz_loadu_pd`] //need i1 + * [x] [`_mm_mask_loadu_pd`] //need i1 + * [x] [`_mm_maskz_loadu_pd`] //need i1 + * [x] [`_mm256_mask_loadu_pd`] //need i1 + * [x] [`_mm256_maskz_loadu_pd`] //need i1 * [x] [`_mm512_loadu_si512`] * [x] [`_mm512_store_epi32`] - * [ ] [`_mm512_mask_store_epi32`] //need i1 - * [_] [`_mm_mask_store_epi32`] //need i1 + * [x] [`_mm512_mask_store_epi32`] //need i1 + * [x] [`_mm_mask_store_epi32`] //need i1 * [x] [`_mm_store_epi32`] - * [_] [`_mm256_mask_store_epi32`] //need i1 + * [x] [`_mm256_mask_store_epi32`] //need i1 * [x] [`_mm256_store_epi32`] * [x] [`_mm512_store_epi64`] - * [ ] [`_mm512_mask_store_epi64`] //need i1 - * [_] [`_mm_mask_store_epi64`] //need i1 + * [x] [`_mm512_mask_store_epi64`] //need i1 + * [x] [`_mm_mask_store_epi64`] //need i1 * [x] [`_mm_store_epi64`] - * [_] [`_mm256_mask_store_epi64`] //need i1 + * [x] [`_mm256_mask_store_epi64`] //need i1 * [x] [`_mm256_store_epi64`] * [x] [`_mm512_store_ps`] - * [ ] [`_mm512_mask_store_ps`] //need i1 - * [_] [`_mm_mask_store_ps`] //need i1 - * [_] [`_mm256_mask_store_ps`] //need i1 + * [x] [`_mm512_mask_store_ps`] //need i1 + * [x] [`_mm_mask_store_ps`] //need i1 + * [x] [`_mm256_mask_store_ps`] //need i1 * [x] [`_mm512_store_pd`] - * [ ] [`_mm512_mask_store_pd`] //need i1 - * [_] [`_mm_mask_store_pd`] //need i1 - * [_] [`_mm256_mask_store_pd`] //need i1 + * [x] [`_mm512_mask_store_pd`] //need i1 + * [x] [`_mm_mask_store_pd`] //need i1 + * [x] [`_mm256_mask_store_pd`] //need i1 * [x] [`_mm512_store_si512`] * [x] [`_mm512_storeu_epi32`] - * [ ] [`_mm512_mask_storeu_epi32`] //need i1 - * [_] [`_mm_mask_storeu_epi32`] //need i1 + * [x] [`_mm512_mask_storeu_epi32`] //need i1 + * [x] [`_mm_mask_storeu_epi32`] //need i1 * [x] [`_mm_storeu_epi32`] - * [_] [`_mm256_mask_storeu_epi32`] //need i1 + * [x] [`_mm256_mask_storeu_epi32`] //need i1 * [x] [`_mm256_storeu_epi32`] * [x] [`_mm512_storeu_epi64`] - * [ ] [`_mm512_mask_storeu_epi64`] //need i1 - * [_] [`_mm_mask_storeu_epi64`] //need i1 + * [x] [`_mm512_mask_storeu_epi64`] //need i1 + * [x] [`_mm_mask_storeu_epi64`] //need i1 * [x] [`_mm_storeu_epi64`] - * [_] [`_mm256_mask_storeu_epi64`] //need i1 + * [x] [`_mm256_mask_storeu_epi64`] //need i1 * [x] [`_mm256_storeu_epi64`] * [x] [`_mm512_storeu_ps`] - * [ ] 
[`_mm512_mask_storeu_ps`] //need i1 - * [_] [`_mm_mask_storeu_ps`] //need i1 - * [_] [`_mm256_mask_storeu_ps`] //need i1 + * [x] [`_mm512_mask_storeu_ps`] //need i1 + * [x] [`_mm_mask_storeu_ps`] //need i1 + * [x] [`_mm256_mask_storeu_ps`] //need i1 * [x] [`_mm512_storeu_pd`] - * [ ] [`_mm512_mask_storeu_pd`] //need i1 - * [_] [`_mm_mask_storeu_pd`] //need i1 - * [_] [`_mm256_mask_storeu_pd`] //need i1 + * [x] [`_mm512_mask_storeu_pd`] //need i1 + * [x] [`_mm_mask_storeu_pd`] //need i1 + * [x] [`_mm256_mask_storeu_pd`] //need i1 * [x] [`_mm512_storeu_si512`] * [ ] [`_mm512_stream_load_si512`] //stream_load_si256, ... not implment yet * [x] [`_mm512_stream_pd`] diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 10e0096339..0363004674 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4227,6 +4227,330 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { ptr::write_unaligned(mem_addr as *mut __m128i, a); } +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
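A note on the `asm!` templates above: the doubled braces are format-string escapes, so `"vmovdqu16 {2}{{{1}}} {{z}}, [{0}]"` renders as, for example, `vmovdqu16 zmm0{k1} {z}, [rax]`; with the `{{z}}` suffix the instruction zeroes unselected lanes, without it the existing lanes of the destination are merged in. A minimal usage sketch of the zero-masking form, assuming these new intrinsics are re-exported from `core::arch::x86_64` as usual (the helper name is hypothetical, not part of the patch):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: load the trailing `tail.len()` (at most 32) 16-bit
// elements of a buffer without reading past its end. Bit i of the mask
// selects lane i; lanes with a cleared bit are zeroed, not read.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn load_i16_tail(tail: &[i16]) -> __m512i {
    debug_assert!(tail.len() <= 32);
    let k = ((1u64 << tail.len()) - 1) as __mmask32;
    _mm512_maskz_loadu_epi16(k, tail.as_ptr())
}
```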
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
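The zero-masking loads pair naturally with the merging stores added below. A hedged sketch of the classic sub-vector copy (hypothetical helper name; both intrinsics are the ones introduced by this patch):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: copy `n <= 64` bytes with one masked load and one
// masked store, never touching memory at or beyond index `n` on either side.
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn copy_up_to_64(dst: *mut i8, src: *const i8, n: usize) {
    debug_assert!(n <= 64);
    let k: __mmask64 = if n == 64 { !0 } else { (1u64 << n) - 1 };
    let v = _mm512_maskz_loadu_epi8(k, src);
    _mm512_mask_storeu_epi8(dst, k, v);
}
```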
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_madd_epi16&expand=3511) @@ -13826,6 +14150,284 @@ mod tests { assert_eq_m128i(r, a); } + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi16(m, black_box(p)); + let e = &[ + 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi16() { + let mut r = [42_i16; 32]; + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm512_loadu_epi16(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 
23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi8() { + let mut r = [42_i8; 64]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let a = _mm512_loadu_epi8(a.as_ptr()); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi16() { + let mut r = [42_i16; 16]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm256_loadu_epi16(a.as_ptr()); + let m = 0b11101000_11001010; + _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_maskz_loadu_epi8(m, black_box(p)); 
+ let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi8() { + let mut r = [42_i8; 32]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm256_loadu_epi8(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_mask_loadu_epi16(src, m, black_box(p)); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi16() { + let mut r = [42_i16; 8]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let a = _mm_loadu_epi16(a.as_ptr()); + let m = 0b11001010; + _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_loadu_epi8(m, black_box(p)); + let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi8() { + let mut r = [42_i8; 16]; + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm_loadu_epi8(a.as_ptr()); + let m = 0b11101000_11001010; + _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); + } + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_madd_epi16() { let a = _mm512_set1_epi16(1); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 766acf46f7..7633442aa6 
100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30323,2842 +30323,4138 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi32) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_setr_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi32) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_set_pd( - e0: f64, - e1: f64, - e2: f64, - e3: f64, - e4: f64, - e5: f64, - e6: f64, - e7: f64, -) -> __m512d { - _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
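Masks for the merging loads usually come from a compare. A hedged sketch (hypothetical helper name; `_mm512_cmpeq_epi32_mask` is the existing AVX-512F compare intrinsic):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: reload from `src_ptr` only the lanes of `current`
// that are still zero; all other lanes of `current` pass through untouched.
#[target_feature(enable = "avx512f")]
unsafe fn fill_zero_lanes(current: __m512i, src_ptr: *const i32) -> __m512i {
    let k: __mmask16 = _mm512_cmpeq_epi32_mask(current, _mm512_setzero_si512());
    _mm512_mask_loadu_epi32(current, k, src_ptr)
}
```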
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut mov: f32 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut mov: f32 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut mov: f64 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut mov: f64 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); - } - let r = simd_insert(a, 0, mov); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
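For the packed-float forms, a hedged sketch of the usual tail-handling pattern these intrinsics enable (hypothetical helper name; everything else is existing AVX-512F API):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: sum an f32 slice of any length. The remainder is
// handled by a single zero-masked load instead of a scalar loop, and the
// load never reads past `xs.len()`.
#[target_feature(enable = "avx512f")]
unsafe fn sum_f32(xs: &[f32]) -> f32 {
    let mut acc = _mm512_setzero_ps();
    let mut chunks = xs.chunks_exact(16);
    for c in chunks.by_ref() {
        acc = _mm512_add_ps(acc, _mm512_loadu_ps(c.as_ptr()));
    }
    let rem = chunks.remainder();
    if !rem.is_empty() {
        let k = ((1u32 << rem.len()) - 1) as __mmask16;
        acc = _mm512_add_ps(acc, _mm512_maskz_loadu_ps(k, rem.as_ptr()));
    }
    let mut lanes = [0.0f32; 16];
    _mm512_storeu_ps(lanes.as_mut_ptr(), acc);
    lanes.iter().sum()
}
```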
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta + extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta - extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
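The 256-bit merging forms make "load what exists, fill the rest with a sentinel" a one-liner. A hedged sketch (hypothetical helper name):

```rust
use core::arch::x86_64::*;

// Hypothetical helper: lanes whose mask bit is clear are not read from
// memory; merge-masking fills them from `src`, here a NaN sentinel.
#[target_feature(enable = "avx512f,avx512vl,avx")]
unsafe fn load_f32x8_or_nan(k: __mmask8, p: *const f32) -> __m256 {
    _mm256_mask_loadu_ps(_mm256_set1_ps(f32::NAN), k, p)
}
```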
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta * extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - add = extracta / extractb; - } - let r = simd_insert(a, 0, add); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
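+///
+/// # Example
+///
+/// A minimal sketch, assuming `avx512f` is available at runtime; the `Aligned`
+/// wrapper is a helper defined only for this example, to satisfy the 64-byte
+/// alignment requirement:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(64))]
+/// struct Aligned([i32; 16]);
+///
+/// unsafe {
+///     let mem = Aligned([7; 16]);
+///     let src = _mm512_set1_epi32(-1);
+///     // The low 8 mask bits are set: elements 0..8 come from `mem`,
+///     // elements 8..16 are copied from `src`.
+///     let r = _mm512_mask_load_epi32(src, 0x00FF, mem.0.as_ptr());
+/// }
+/// ```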
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vsqrtss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
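+///
+/// # Example
+///
+/// An illustrative sketch (assumes runtime `avx512f`/`avx512vl` support); the
+/// `Aligned` wrapper exists only to provide the required 32-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(32))]
+/// struct Aligned([i32; 8]);
+///
+/// unsafe {
+///     let mem = Aligned([3; 8]);
+///     let src = _mm256_set1_epi32(0);
+///     // Only the low four elements are loaded; the rest stay 0 (from `src`).
+///     let r = _mm256_mask_load_epi32(src, 0b0000_1111, mem.0.as_ptr());
+/// }
+/// ```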
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vsqrtsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
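+///
+/// # Example
+///
+/// A short sketch (assumes runtime `avx512f`/`avx512vl` support); `Aligned` is
+/// defined here only to guarantee the 16-byte alignment:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned([i32; 4]);
+///
+/// unsafe {
+///     let mem = Aligned([5; 4]);
+///     let src = _mm_set1_epi32(-1);
+///     // Mask 0b0101 loads elements 0 and 2; elements 1 and 3 come from `src`.
+///     let r = _mm_mask_load_epi32(src, 0b0101, mem.0.as_ptr());
+/// }
+/// ```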
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - 0b1, - _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
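+///
+/// # Example
+///
+/// A minimal sketch, assuming `avx512f` is available at runtime:
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0_i32; 16];
+///     let a = _mm512_set1_epi32(9);
+///     // Only the even-indexed lanes are written; the odd lanes keep their
+///     // previous contents (here, 0).
+///     _mm512_mask_storeu_epi32(out.as_mut_ptr(), 0b0101_0101_0101_0101, a);
+/// }
+/// ```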
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b1, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) +pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
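+///
+/// # Example
+///
+/// An illustrative sketch (assumes runtime `avx512f`/`avx512vl` support):
+///
+/// ```ignore
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let mut out = [0_i32; 4];
+///     let a = _mm_set1_epi32(5);
+///     // Mask 0b0011 writes lanes 0 and 1 only; lanes 2 and 3 are untouched.
+///     _mm_mask_storeu_epi32(out.as_mut_ptr(), 0b0011, a);
+/// }
+/// ```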
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ -/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ -/// _MM_MANT_NORM_1_2 // interval [1, 2)\ -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ -/// The sign is determined by sc which can take the following values:\ -/// _MM_MANT_SIGN_src // sign = sign(src)\ -/// _MM_MANT_SIGN_zero // sign = 0\ -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] -#[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm4!(NORM); - static_assert_imm2!(SIGN); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vrndscaless(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. 
+/// mem_addr does not need to be aligned on any particular boundary. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_imm8!(IMM8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vrndscaless(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_ps) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ -/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ -/// _MM_FROUND_TO_NEG_INF // round down\ -/// _MM_FROUND_TO_POS_INF // round up\ -/// _MM_FROUND_TO_ZERO // truncate\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_imm8!(IMM8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vrndscalesd(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
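// Editor's note: illustrative only, not part of this diff. The aligned variants above
// (vmovaps/vmovapd/vmovdqa32/vmovdqa64 under a k mask) fault unless mem_addr meets the
// full vector alignment stated in their docs, so a caller typically guarantees it up
// front; the Aligned64 wrapper below is an assumed helper, not something this patch adds.
#[cfg(target_arch = "x86_64")]
#[repr(align(64))]
struct Aligned64([f32; 16]);

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn aligned_masked_store_demo() {
    use std::arch::x86_64::*;
    let mut out = Aligned64([0.0; 16]);
    let v = _mm512_set1_ps(1.5);
    // Write the even lanes only; odd lanes keep their previous contents (0.0 here).
    _mm512_mask_store_ps(out.0.as_mut_ptr(), 0b0101_0101_0101_0101, v);
    assert_eq!(out.0[0], 1.5);
    assert_eq!(out.0[1], 0.0);
}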
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vscalefss( - a.as_f32x4(), - b.as_f32x4(), - _mm_setzero_ps().as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_pd) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - _mm_setzero_pd().as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = simd_extract(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(a, 0, fmadd); - transmute(r) +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] +pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(a, 0, fmadd); +pub unsafe fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
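// Editor's note: a small argument-order check, illustrative and not part of this diff.
// _mm512_setr_pd takes its arguments in memory (lowest-element-first) order, while
// _mm512_set_pd takes them highest-element-first and is implemented above by reversing
// into _mm512_setr_pd, so the two calls below build the same vector.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn set_vs_setr_pd() {
    use std::arch::x86_64::*;
    let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
    let (mut out_a, mut out_b) = ([0.0f64; 8], [0.0f64; 8]);
    _mm512_storeu_pd(out_a.as_mut_ptr(), a);
    _mm512_storeu_pd(out_b.as_mut_ptr(), b);
    assert_eq!(out_a, out_b);
}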
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss))] -pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmadd: f32 = simd_extract(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(c, 0, fmadd); - transmute(r) +pub unsafe fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut mov: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = 0.; +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut mov: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd))] -pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmadd: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut mov: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(c, 0, fmadd); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
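// Editor's note: illustrative sketch, not part of this diff, of the scalar masked-move
// semantics implemented above with simd_extract/simd_insert: when mask bit 0 is set,
// lane 0 comes from b; when it is clear, lane 0 comes from src (writemask) or is zeroed
// (zeromask), and the upper lanes are always copied from a. The demo name is assumed.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_move_demo() {
    use std::arch::x86_64::*;
    let src = _mm_set_ps(0.0, 0.0, 0.0, 9.0); // lane 0 = 9.0
    let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
    let b = _mm_set_ps(0.0, 0.0, 0.0, 7.0); // lane 0 = 7.0
    let taken = _mm_mask_move_ss(src, 0b1, a, b); // mask bit set: lane 0 = 7.0
    let kept = _mm_mask_move_ss(src, 0b0, a, b); // mask bit clear: lane 0 = 9.0
    assert_eq!(_mm_cvtss_f32(taken), 7.0);
    assert_eq!(_mm_cvtss_f32(kept), 9.0);
}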
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut mov: f64 = 0.; if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, mov); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = 0.; +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss))] -pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmsub: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc = -fmsub; - fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(c, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = 0.; +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta + extractb; } - let r = simd_insert(a, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd))] -pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmsub: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc = -fmsub; - fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(c, 0, fmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
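// Editor's note: illustrative only, not part of this diff. The masked scalar add/sub
// forms above compute lane 0 in plain Rust and fall back to src (writemask) or 0.0
// (zeromask) when mask bit 0 is clear; a quick zeromask example:
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_scalar_add_demo() {
    use std::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    let sum = _mm_maskz_add_ss(0b1, a, b); // mask bit set: lane 0 = 1.0 + 2.0
    let zeroed = _mm_maskz_add_ss(0b0, a, b); // mask bit clear: lane 0 zeroed
    assert_eq!(_mm_cvtss_f32(sum), 3.0);
    assert_eq!(_mm_cvtss_f32(zeroed), 0.0);
}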
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta = -fnmadd; + let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta - extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = 0.; +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss))] -pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmadd: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; } - let r = simd_insert(c, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = 0.; +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd))] -pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmadd: f64 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + add = extracta * extractb; } - let r = simd_insert(c, 0, fnmadd); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = 0.; +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss))] -pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmsub: f32 = simd_extract(c, 0); +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - let extractc = -fnmsub; - fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(c, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = simd_extract(a, 0); +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta = -fnmsub; + let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
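// Editor's note: illustrative sketch, not part of this diff, of the writemask form of
// the masked scalar divide above: when mask bit 0 is clear, lane 0 of the result is
// copied from src instead of computing a / b. The demo name is an assumption.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn masked_scalar_div_demo() {
    use std::arch::x86_64::*;
    let src = _mm_set_sd(5.0);
    let a = _mm_set_sd(8.0);
    let b = _mm_set_sd(2.0);
    let quotient = _mm_mask_div_sd(src, 0b1, a, b); // mask bit set: lane 0 = 4.0
    let kept = _mm_mask_div_sd(src, 0b0, a, b); // mask bit clear: lane 0 = 5.0 (from src)
    assert_eq!(_mm_cvtsd_f64(quotient), 4.0);
    assert_eq!(_mm_cvtsd_f64(kept), 5.0);
}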
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = 0.; +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); - let extractc = -extractc; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + add = extracta / extractb; } - let r = simd_insert(a, 0, fnmsub); + let r = simd_insert(a, 0, add); transmute(r) } -/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd))] -pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmsub: f64 = simd_extract(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc = -fnmsub; - fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); - } - let r = simd_insert(c, 0, fnmsub); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of 
dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vaddss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
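A minimal usage sketch for the zero-masked scalar divide added above (`_mm_maskz_div_sd`). This is not part of the diff; the demo function name is invented, and the code assumes the intrinsic is reachable through `core::arch::x86_64` on an AVX512F-capable CPU (a nightly feature gate may still be required depending on the toolchain).

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn maskz_div_sd_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_pd(10.0, 8.0); // lane 1 = 10.0, lane 0 = 8.0
    let b = _mm_set_pd(99.0, 2.0); // lane 0 = 2.0
    // Mask bit 0 set: lane 0 = 8.0 / 2.0 = 4.0, lane 1 is copied from a.
    let r = _mm_maskz_div_sd(0b1, a, b);
    assert_eq!(_mm_cvtsd_f64(r), 4.0);
    // Mask bit 0 clear: lane 0 is zeroed instead of computed.
    let z = _mm_maskz_div_sd(0b0, a, b);
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```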
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vaddss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vaddss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
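To illustrate the writemask/zeromask behaviour of the scalar max intrinsics just introduced (`_mm_mask_max_ss`, `_mm_maskz_max_sd`), here is a hypothetical sketch, separate from the diff; it only relies on the signatures shown above.

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_max_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lane 0 = 1.0
    let b = _mm_set_ps(0.0, 0.0, 0.0, 5.0); // lane 0 = 5.0
    let src = _mm_set1_ps(-1.0);
    // Writemask set: lane 0 = max(1.0, 5.0); upper lanes come from a.
    let r = _mm_mask_max_ss(src, 0b1, a, b);
    assert_eq!(_mm_cvtss_f32(r), 5.0);
    // Writemask clear: lane 0 falls back to src's lane 0 (-1.0).
    let f = _mm_mask_max_ss(src, 0b0, a, b);
    assert_eq!(_mm_cvtss_f32(f), -1.0);
    // Zeromask clear: the double-precision variant zeroes lane 0 instead.
    let z = _mm_maskz_max_sd(0b0, _mm_set1_pd(2.0), _mm_set1_pd(3.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```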
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vaddsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vaddsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vaddsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsubss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
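The masked scalar min intrinsics follow the same pattern; a short sketch (not part of the diff, demo name invented) under the same AVX512F assumptions:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_min_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(7.0);   // lane 0 = 7.0, upper lanes 0.0
    let b = _mm_set_ss(3.0);   // lane 0 = 3.0
    let src = _mm_set_ss(100.0);
    // Writemask set: lane 0 = min(7.0, 3.0) = 3.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_min_ss(src, 0b1, a, b)), 3.0);
    // Writemask clear: lane 0 is taken from src (100.0).
    assert_eq!(_mm_cvtss_f32(_mm_mask_min_ss(src, 0b0, a, b)), 100.0);
    // Double-precision zeromask variant: lane 0 zeroed when bit 0 is clear.
    let z = _mm_maskz_min_sd(0b0, _mm_set_sd(7.0), _mm_set_sd(3.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```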
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsubss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vsubss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsubsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsubsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
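A hedged sketch for the masked scalar square-root intrinsics above; the square root is taken from `b`'s lane 0 while the upper lanes come from `a`, as the doc text states. Not part of the diff; exact inputs are chosen so the result is independent of rounding.

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_sqrt_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(2.0);   // upper lanes of the result come from a
    let b = _mm_set_ss(9.0);   // the square root is taken from b's lane 0
    let src = _mm_set_ss(-1.0);
    // Writemask set: lane 0 = sqrt(9.0) = 3.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_sqrt_ss(src, 0b1, a, b)), 3.0);
    // Writemask clear: lane 0 is copied from src.
    assert_eq!(_mm_cvtss_f32(_mm_mask_sqrt_ss(src, 0b0, a, b)), -1.0);
    // Zeromask clear: the sd variant zeroes lane 0.
    let z = _mm_maskz_sqrt_sd(0b0, _mm_set_sd(2.0), _mm_set_sd(9.0));
    assert_eq!(_mm_cvtsd_f64(z), 0.0);
}
```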
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vsubsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmulss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmulss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmulss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmulsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmulsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
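Since `rsqrt14` is an approximation (relative error below 2^-14 per the doc text), a sketch exercising it should compare against a tolerance rather than an exact value. Hypothetical code, not part of the diff:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn rsqrt14_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(4.0); // 1/sqrt(4.0) = 0.5
    let r = _mm_cvtss_f32(_mm_rsqrt14_ss(a, b));
    // The result is an approximation with relative error below 2^-14.
    assert!((r - 0.5).abs() <= 0.5 * (1.0 / 16384.0));
    // Masked-off lane 0 falls back to src for the writemask variant...
    let src = _mm_set_ss(42.0);
    assert_eq!(_mm_cvtss_f32(_mm_mask_rsqrt14_ss(src, 0b0, a, b)), 42.0);
    // ...and to 0.0 for the zeromask variant.
    assert_eq!(_mm_cvtss_f32(_mm_maskz_rsqrt14_ss(0b0, a, b)), 0.0);
}
```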
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmulsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vdivss(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vdivss(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vdivss(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vdivsd(a, b, zero, 0b1, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vdivsd(a, b, src, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vdivsd(a, b, zero, k, ROUNDING); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
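The `rcp14` family mirrors `rsqrt14`: an approximate reciprocal of `b`'s lane 0 with the same 2^-14 error bound. A hypothetical sketch, not part of the diff:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn rcp14_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_sd(0.0);
    let b = _mm_set_sd(8.0); // 1/8.0 = 0.125
    let r = _mm_cvtsd_f64(_mm_rcp14_sd(a, b));
    // Approximate reciprocal: relative error below 2^-14.
    assert!((r - 0.125).abs() <= 0.125 * (1.0 / 16384.0));
    // Writemask clear: lane 0 comes from src; zeromask clear: lane 0 is 0.0.
    let src = _mm_set_sd(9.0);
    assert_eq!(_mm_cvtsd_f64(_mm_mask_rcp14_sd(src, 0b0, a, b)), 9.0);
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_rcp14_sd(0b0, a, b)), 0.0);
}
```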
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmaxss(a, b, zero, 0b1, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmaxss(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vmaxss(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmaxsd(a, b, zero, 0b1, SAE); - transmute(r) -} - -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmaxsd(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vmaxsd(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub unsafe fn _mm_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128, + b: __m128, +) -> __m128 { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vminss(a, b, zero, 0b1, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_ss( +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub unsafe fn _mm_mask_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vminss(a, b, src, k, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub unsafe fn _mm_maskz_getmant_ss< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vminss(a, b, zero, k, SAE); + let r = vgetmantss(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(2, 3)] +pub unsafe fn _mm_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vminsd(a, b, zero, 0b1, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, 0b1, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(4, 5)] +pub unsafe fn _mm_mask_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - let r = vminsd(a, b, src, k, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ +/// The mantissa is normalized to the interval specified by interv, which can take the following values:\ +/// _MM_MANT_NORM_1_2 // interval [1, 2)\ +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\ +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\ +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\ +/// The sign is determined by sc which can take the following values:\ +/// _MM_MANT_SIGN_src // sign = sign(src)\ +/// _MM_MANT_SIGN_zero // sign = 0\ +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
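A sketch of how the getmant intrinsics above pair with getexp to split a value into mantissa and exponent (illustrative only, not part of the patch; it assumes the `_MM_MANT_NORM_1_2`/`_MM_MANT_SIGN_SRC` constants as they are named in this module and an x86_64 target with AVX-512F available):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn getmant_demo() {
    let a = _mm_set1_ps(0.0); // upper three lanes of the result come from `a`
    let b = _mm_set_ps(0.0, 0.0, 0.0, 24.0); // lower lane: 24.0 = 1.5 * 2^4

    // Normalize b[0] into [1, 2) and keep the source sign: 24.0 -> 1.5.
    let mant = _mm_cvtss_f32(_mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b));
    assert_eq!(mant, 1.5);

    // getexp gives the matching power of two, so the original value is mant * 2^exp.
    let exp = _mm_cvtss_f32(_mm_getexp_ss(a, b));
    assert_eq!(mant * 2f32.powi(exp as i32), 24.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { getmant_demo() };
    }
}
```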
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] +#[rustc_legacy_const_generics(3, 4)] +pub unsafe fn _mm_maskz_getmant_sd< + const NORM: _MM_MANTISSA_NORM_ENUM, + const SIGN: _MM_MANTISSA_SIGN_ENUM, +>( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); + static_assert_imm4!(NORM); + static_assert_imm2!(SIGN); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vminsd(a, b, zero, k, SAE); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, zero, k, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); +pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + let r = vrndscaless(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the 
rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +pub unsafe fn _mm_mask_roundscale_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vsqrtss(a, b, src, k, ROUNDING); + let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss( +pub unsafe fn _mm_maskz_roundscale_ss( k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vsqrtss(a, b, zero, k, ROUNDING); + let r = vrndscaless(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); +pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + let r = vrndscalesd(a, b, zero, 0b11111111, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // 
round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +pub unsafe fn _mm_mask_roundscale_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - let r = vsqrtsd(a, b, src, k, ROUNDING); + let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) #[inline] #[target_feature(enable = "avx512f")] 
-#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +pub unsafe fn _mm_maskz_roundscale_sd( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); + static_assert_imm8!(IMM8); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - let r = vsqrtsd(a, b, zero, k, ROUNDING); + let r = vrndscalesd(a, b, zero, k, IMM8, _MM_FROUND_CUR_DIRECTION); transmute(r) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetexpss(a, b, zero, 0b1, SAE); - transmute(r) + transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION)) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
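The roundscale immediate is easy to get wrong, so here is a small sketch (illustrative, not from the patch): per Intel's definition of vrndscaless, `IMM8[7:4]` is the number of fraction bits to keep and `IMM8[2:0]` selects the rounding mode, with 0 meaning round-to-nearest.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_demo() {
    let a = _mm_set1_ps(0.0); // upper three lanes pass through from `a`
    let b = _mm_set_ss(2.6);

    // IMM8 = 0x00: keep 0 fraction bits, round to nearest -> 3.0.
    let r = _mm_roundscale_ss::<0x00>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 3.0);

    // IMM8 = 0x20: keep 2 fraction bits -> round(2.6 * 4) / 4 = 2.5.
    let r = _mm_roundscale_ss::<0x20>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 2.5);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { roundscale_demo() };
    }
}
```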
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_ss( - src: __m128, - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - let r = vgetexpss(a, b, src, k, SAE); - transmute(r) + transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) } -/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let zero = _mm_setzero_ps().as_f32x4(); - let r = vgetexpss(a, b, zero, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let zero = _mm_setzero_pd().as_f64x2(); - let r = vgetexpsd(a, b, zero, 0b1, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_sd( - src: __m128d, - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetexpsd(a, b, src, k, SAE); - transmute(r) +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) } -/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
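As a quick illustration of the scalef semantics described above, where the low lane becomes `a[0] * 2^floor(b[0])` (sketch only, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn scalef_demo() {
    let a = _mm_set_sd(3.0);
    let b = _mm_set_sd(2.9); // the exponent argument is floored: 2^2

    // 3.0 * 2^floor(2.9) = 3.0 * 4.0 = 12.0
    let r = _mm_scalef_sd(a, b);
    assert_eq!(_mm_cvtsd_f64(r), 12.0);

    // Writemask variant: with mask bit 0 clear, the low lane comes from `src`.
    let src = _mm_set_sd(-7.0);
    let r = _mm_mask_scalef_sd(src, 0b0, a, b);
    assert_eq!(_mm_cvtsd_f64(r), -7.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { scalef_demo() };
    }
}
```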
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
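The three mask flavours above differ only in where the low lane falls back to when mask bit 0 is clear: a for `_mm_mask_*`, zero for `_mm_maskz_*`, and c for `_mm_mask3_*`. A short sketch, not part of the patch, with made-up values:

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_mask_demo() {
    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(3.0), _mm_set_ss(4.0));

    // Mask bit set: low lane = a * b + c = 10.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b1, b, c)), 10.0);
    // Mask bit clear: _mm_mask_* keeps a[0] ...
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmadd_ss(a, 0b0, b, c)), 2.0);
    // ... _mm_maskz_* zeroes the lane ...
    assert_eq!(_mm_cvtss_f32(_mm_maskz_fmadd_ss(0b0, a, b, c)), 0.0);
    // ... and _mm_mask3_* keeps c[0] (its upper lanes also come from c).
    assert_eq!(_mm_cvtss_f32(_mm_mask3_fmadd_ss(a, b, c, 0b0)), 4.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmadd_mask_demo() };
    }
}
```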
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmsub: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213ss))] +pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmsub: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc = -fmsub; + fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmsub: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
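The same pattern applies to the fused multiply-subtract forms, which compute `a * b - c` in the low lane (illustrative sketch, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmsub_mask_demo() {
    let (a, b, c) = (_mm_set_ss(2.0), _mm_set_ss(5.0), _mm_set_ss(4.0));

    // Mask bit set: low lane = 2.0 * 5.0 - 4.0 = 6.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmsub_ss(a, 0b1, b, c)), 6.0);
    // Mask bit clear: the fallback lane depends on the variant.
    assert_eq!(_mm_cvtss_f32(_mm_mask_fmsub_ss(a, 0b0, b, c)), 2.0); // from a
    assert_eq!(_mm_cvtss_f32(_mm_maskz_fmsub_ss(0b0, a, b, c)), 0.0); // zeroed
    assert_eq!(_mm_cvtss_f32(_mm_mask3_fmsub_ss(a, b, c, 0b0)), 4.0); // from c
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fmsub_mask_demo() };
    }
}
```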
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmsub213sd))] +pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmsub: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc = -fmsub; + fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fnmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213ss))] +pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fnmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fnmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
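One more sketch (not from the patch) pinning down the sign convention: the fnmadd forms negate the product before adding c, so the low lane is `-(a[0] * b[0]) + c[0]`.

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fnmadd_demo() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(10.0));

    // Low lane: -(2.0 * 3.0) + 10.0 = 4.0 when mask bit 0 is set.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b1, b, c)), 4.0);
    // With the bit clear the lane falls back to a[0].
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 0b0, b, c)), 2.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fnmadd_demo() };
    }
}
```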
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmadd213sd))] +pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fnmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fnmsub: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fnmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213ss))] +pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fnmsub: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract(b, 0); + let extractc = -fnmsub; + fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fnmsub: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + let extractc = -extractc; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fnmsub); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
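And the fnmsub forms negate the product and then subtract c, so the low lane is `-(a[0] * b[0]) - c[0]` (illustrative sketch, not part of the patch):

```rust
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fnmsub_demo() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(1.0));

    // Low lane: -(2.0 * 3.0) - 1.0 = -7.0 when mask bit 0 is set.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 0b1, b, c)), -7.0);
    // Zeromask variant: the lane is zeroed when the bit is clear.
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_fnmsub_sd(0b0, a, b, c)), 0.0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { fnmsub_demo() };
    }
}
```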
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfnmsub213sd))] +pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fnmsub: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract(b, 0); + let extractc = -fnmsub; + fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fnmsub); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vaddss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of 
dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vaddss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vaddsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = 
a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vaddsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsubss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // 
truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsubss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsubsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store 
the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsubsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] 
+#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vmulss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vmulss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round 
down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vmulsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vmulsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision 
(32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vdivss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see 
_MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vdivss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vdivsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 
is not set), and copy the upper element from a to the upper element of dst.\
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_div_round_sd<const ROUNDING: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_rounding!(ROUNDING);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vdivsd(a, b, zero, k, ROUNDING);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    let r = vmaxss(a, b, zero, 0b1, SAE);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_ss<const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
+    let r = vmaxss(a, b, src, k, SAE);
+    transmute(r)
+}
+
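For every *_round_* intrinsic in this stretch, the rounding or SAE mode is a compile-time constant (hence the #[rustc_legacy_const_generics] attributes and the static_assert_rounding!/static_assert_sae! checks). A hedged usage sketch for the max forms defined above, assuming their <const SAE: i32> parameter and passing it via turbofish (illustrative only):

// Illustrative sketch, not part of the patch: SAE must be either
// _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC (suppress exceptions).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn max_round_ss_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(4.0);
    // Unmasked form: lower lane is max(1.0, 4.0) = 4.0.
    let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
    assert_eq!(_mm_cvtss_f32(r), 4.0);
    // Writemask form with mask bit 0 clear: lower lane falls back to `src`.
    let src = _mm_set_ss(-1.0);
    let r = _mm_mask_max_round_ss::<_MM_FROUND_NO_EXC>(src, 0b0, a, b);
    assert_eq!(_mm_cvtss_f32(r), -1.0);
}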
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    static_assert_sae!(SAE);
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    let r = vmaxss(a, b, zero, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn _mm_max_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vmaxsd(a, b, zero, 0b1, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(4)]
+pub unsafe fn _mm_mask_max_round_sd<const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
+    let r = vmaxsd(a, b, src, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn _mm_maskz_max_round_sd<const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_sae!(SAE);
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
+    let r = vmaxsd(a, b, zero, k, SAE);
+    transmute(r)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vminss(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vminss(a, b, src, k, SAE); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vminss(a, b, zero, k, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vminsd(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vminsd(a, b, src, k, SAE); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vminsd(a, b, zero, k, SAE); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsqrtss(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vsqrtss(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be 
one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsqrtsd(a, b, src, k, ROUNDING); + transmute(r) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vsqrtsd(a, b, zero, k, ROUNDING); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
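Since "essentially calculates floor(log2(x))" is easy to gloss over, a short numeric sketch may help. It uses the unmasked _mm_getexp_ss companion from the same AVX-512F family (an assumption here, since that intrinsic is defined elsewhere in this file) so that the SAE parameter does not come into play (illustrative only):

// Illustrative sketch, not part of the patch: getexp returns the exponent of
// the lower lane as a floating-point number, i.e. floor(log2(|x|)).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn getexp_ss_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0); // upper lanes of the result come from `a`
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(8.0))), 3.0); // floor(log2(8)) = 3
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(10.0))), 3.0); // floor(log2(10)) = 3
    assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, _mm_set_ss(0.25))), -2.0); // floor(log2(0.25)) = -2
}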
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vgetexpss(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetexpss(a, b, src, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + let r = vgetexpss(a, b, zero, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vgetexpsd(a, b, zero, 0b1, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetexpsd(a, b, src, k, SAE); + transmute(r) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); @@ -41325,3266 +42621,4231 @@ mod tests { let src = _mm_set1_epi32(0); let r = _mm_mask_cvtps_epu32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); + let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu8_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu8_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi16_epi32(a); + let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_epi32(-1); + let r = _mm512_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); + let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi16_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); + let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepi16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvtps_epu32(0, a); + unsafe fn test_mm_maskz_cvtepi16_epi32() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_cvtepi16_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 14, 14, 16); + let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi8_epi32(a); + unsafe fn test_mm512_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu16_epi32(a); let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi8_epi32(src, 0, a); + let r = _mm512_mask_cvtepu16_epi32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a); + let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); let e = 
_mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm512_maskz_cvtepu16_epi32() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu16_epi32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a); + let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm256_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi8_epi32(src, 0, a); + let r = _mm256_mask_cvtepu16_epi32(src, 0, a); assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi8_epi32(src, 0b11111111, a); + let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm256_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi32(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi8_epi32(0b11111111, a); + let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi8_epi32(src, 0, a); + let r = _mm_mask_cvtepu16_epi32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi8_epi32(src, 0b00001111, a); + let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepi8_epi32(0, a); + unsafe fn test_mm_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi8_epi32(0b00001111, a); + let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu8_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_ps(a); + let 
e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu8_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepi32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu8_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_maskz_cvtepi32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + unsafe fn test_mm256_mask_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm256_set1_ps(-1.); + let r = _mm256_mask_cvtepi32_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_ps() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtepi32_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); + let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let src = _mm_set1_ps(-1.); + let r = _mm_mask_cvtepi32_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi32_ps() { + let a = _mm_set_epi32(1, 2, 3, 4); + let r = _mm_maskz_cvtepi32_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtepi32_ps(0b00001111, a); + let e = _mm_set_ps(1., 2., 3., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepu32_ps(a); + let e = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm512_set1_ps(-1.); + let r = _mm512_mask_cvtepu32_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); + let e = _mm512_set_ps( + -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepu32_ps() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepu32_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtepi32_epi16(src, 0, a); assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu8_epi32(0, a); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtepi32_epi16() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu8_epi32(src, 0, a); + unsafe fn test_mm256_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu8_epi32() { - let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu8_epi32(0, a); + unsafe fn test_mm_maskz_cvtepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn 
test_mm512_maskz_cvtepi32_epi8() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepi16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepi16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepi16_epi32(0b11111111, a); - let e = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m256i(r, e); + unsafe fn test_mm256_mask_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepi16_epi32(src, 0, a); + unsafe fn test_mm256_maskz_cvtepi32_epi8() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi16_epi32() { - let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_cvtepi16_epi32(0, a); + unsafe fn test_mm_maskz_cvtepi32_epi8() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi16_epi32(0b00001111, a); - let e = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } 
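Before the saturating conversion tests that follow, a minimal sketch of how the three down-conversion families handle an out-of-range lane; the input value is an assumption chosen for illustration, and the assert helpers are the ones this test module already uses:

let a = _mm_set_epi32(0, 0, 0, i32::MAX);
// plain truncation keeps the low byte: 0x7FFFFFFF -> 0xFF == -1_i8
assert_eq_m128i(_mm_cvtepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1));
// signed saturation clamps to the i8 range
assert_eq_m128i(_mm_cvtsepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX));
// unsigned saturation clamps to the u8 range (255, i.e. -1 when read back as i8)
assert_eq_m128i(_mm_cvtusepi32_epi8(a), _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, u8::MAX as i8));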
#[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu16_epi32(a); - let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_cvtsepi32_epi16(a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_epi32(-1); - let r = _mm512_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a); - let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let src = _mm256_set1_epi16(-1); + let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu16_epi32() { - let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu16_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_maskz_cvtsepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, + ); + let r = _mm512_maskz_cvtsepi32_epi16(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm256_set_epi16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, + ); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm256_set1_epi32(-1); - let r = _mm256_mask_cvtepu16_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_cvtepu16_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn 
test_mm256_mask_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm_set1_epi32(-1); - let r = _mm_mask_cvtepu16_epi32(src, 0, a); + unsafe fn test_mm_mask_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepu16_epi32() { - let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm_maskz_cvtepu16_epi32(0, a); + unsafe fn test_mm_maskz_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtsepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let r = _mm512_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepi32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_mask_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let src = _mm_set1_epi8(-1); + let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = 
_mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm512_maskz_cvtsepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); - assert_eq_m512(r, e); + let r = _mm512_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm256_set1_ps(-1.); - let r = _mm256_mask_cvtepi32_ps(src, 0, a); - assert_eq_m256(r, src); - let r = _mm256_mask_cvtepi32_ps(src, 0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); + unsafe fn test_mm256_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_ps() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtepi32_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtepi32_ps(0b11111111, a); - let e = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); - assert_eq_m256(r, e); + unsafe fn test_mm256_mask_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let src = _mm_set1_ps(-1.); - let r = _mm_mask_cvtepi32_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtepi32_ps(src, 0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); + unsafe fn test_mm256_maskz_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_ps() { - let a = _mm_set_epi32(1, 2, 3, 4); - let r = _mm_maskz_cvtepi32_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtepi32_ps(0b00001111, a); - let e = _mm_set_ps(1., 2., 3., 4.); - assert_eq_m128(r, e); - 
} - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepu32_ps(a); - let e = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + unsafe fn test_mm_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let src = _mm512_set1_ps(-1.); - let r = _mm512_mask_cvtepu32_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a); - let e = _mm512_set_ps( - -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15., + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepu32_ps() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepu32_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a); - let e = _mm512_set_ps( - 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15., + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); - assert_eq_m512(r, e); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi16(a); + let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtepi32_epi16(src, 0, a); + let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtusepi32_epi16(src, 
0b00000000_11111111, a); + let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi16() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm512_maskz_cvtusepi32_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtusepi32_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtepi32_epi16(src, 0, a); + unsafe fn test_mm256_mask_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm256_maskz_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtusepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + unsafe fn test_mm_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_cvtusepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); + unsafe fn test_mm_mask_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtepi32_epi16(src, 0, a); + let r = _mm_mask_cvtusepi32_epi16(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); + let e = 
_mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi16(0, a); + unsafe fn test_mm_maskz_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_maskz_cvtusepi32_epi16(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_mm512_mask_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtepi32_epi8(src, 0, a); + let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); + let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtepi32_epi8() { - let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm512_maskz_cvtepi32_epi8(0, a); + unsafe fn test_mm512_maskz_cvtusepi32_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi32( + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, + ); + let r = _mm512_maskz_cvtusepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_mask_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); let src = 
_mm_set1_epi8(0); - let r = _mm256_mask_cvtepi32_epi8(src, 0, a); + let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtepi32_epi8() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtepi32_epi8(0, a); + unsafe fn test_mm256_maskz_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_maskz_cvtusepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = 
_mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtepi32_epi8() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtepi32_epi8(0b00001111, a); - let e = 
_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, + ); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, ); - let r = _mm512_cvtsepi32_epi16(a); - #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, ); - assert_eq_m256i(r, e); + let e = _mm512_setr_ps( + 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi16() { + unsafe fn test_mm512_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 8., 10., 10., 12., + 12., 14., 14., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b00000000_11111111, + a, ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); #[rustfmt::skip] - let e = _mm256_set_epi16( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., ); - assert_eq_m256i(r, e); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, + unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { + let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + let r = 
_mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b00000000_11111111, + a, ); - let r = _mm512_maskz_cvtsepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); #[rustfmt::skip] - let e = _mm256_set_epi16( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i16::MIN, i16::MAX, + let e = _mm512_setr_ps( + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let src = _mm_set1_epi16(-1); - let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi16() { - let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm256_maskz_cvtsepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_mm256_mask_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_cvtsepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + unsafe fn test_mm256_maskz_cvt_roundps_ph() { + let a = _mm256_set1_ps(1.); + let r = 
_mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); + unsafe fn test_mm_mask_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtsepi32_epi16(src, 0, a); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi16() { - let a = _mm_set_epi32(4, 5, 6, 7); - let r = _mm_maskz_cvtsepi32_epi16(0, a); + unsafe fn test_mm_maskz_cvt_roundps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, + unsafe fn test_mm512_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); + let e = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - -1, -1, -1, -1, - -1, -1, -1, -1, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); - assert_eq_m128i(r, e); + unsafe fn test_mm512_mask_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let src = _mm256_set1_epi16(0); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtsepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MAX, - ); - let r = _mm512_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 8, 9, 10, 11, - 12, 13, i8::MIN, i8::MAX, - ); + unsafe fn 
test_mm512_maskz_cvtps_ph() { + let a = _mm512_set1_ps(1.); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + unsafe fn test_mm256_maskz_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + unsafe fn test_mm_mask_cvtps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m128i(r, src); - let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtsepi32_epi8() { - let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); - let r = _mm256_maskz_cvtsepi32_epi8(0, a); + unsafe fn test_mm_maskz_cvtps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 9, 10, 11, 12, - 13, 14, 15, 16, - ); + let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); + let e = _mm_setr_epi64x(4323521613979991040, 0); assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_cvtsepi32_epi8(a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let src = _mm_set1_epi8(0); - let r = 
_mm_mask_cvtsepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtsepi32_epi8() { - let a = _mm_set_epi32(13, 14, 15, 16); - let r = _mm_maskz_cvtsepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); - #[rustfmt::skip] - let e = _mm_set_epi8( - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 13, 14, 15, 16, + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - assert_eq_m128i(r, e); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - let r = _mm512_cvtusepi32_epi16(a); - let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + let r = _mm512_cvtph_ps(a); + let e = _mm512_set1_ps(1.); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_mask_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + 4323521613979991040, + ); + let src = _mm512_set1_ps(0.); + let r = _mm512_mask_cvtph_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); - let src = _mm256_set1_epi16(-1); - let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a); - let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi16() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_maskz_cvtph_ps() { + let a = _mm256_setr_epi64x( + 4323521613979991040, + 
4323521613979991040, + 4323521613979991040, + 4323521613979991040, ); - let r = _mm512_maskz_cvtusepi32_epi16(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a); - let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m256i(r, e); + let r = _mm512_maskz_cvtph_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_cvtusepi32_epi16(a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm256_set1_ps(0.); + let r = _mm256_mask_cvtph_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm256_maskz_cvtph_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtph_ps(0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi16() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - let r = _mm256_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); - let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtph_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtph_ps(src, 0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_cvtusepi32_epi16(a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + unsafe fn test_mm_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm_maskz_cvtph_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtph_ps(0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtusepi32_epi16(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epi32() { + 
let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi16() { - let a = _mm_set_epi32(5, 6, 7, 8); - let r = _mm_maskz_cvtusepi32_epi16(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); - let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let src = _mm_set1_epi8(-1); - let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a); - let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtusepi32_epi8() { - #[rustfmt::skip] - let a = _mm512_set_epi32( - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, i32::MIN, i32::MIN, + unsafe fn test_mm512_mask_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1); - assert_eq_m128i(r, e); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); + assert_eq_m512i(r, 
src); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvttps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvttps_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvttps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtusepi32_epi8() { - let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); - let r = _mm256_maskz_cvtusepi32_epi8(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn 
test_mm_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_cvtusepi32_epi8(a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); - assert_eq_m128i(r, e); + unsafe fn test_mm256_maskz_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let src = _mm_set1_epi8(0); - let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + unsafe fn test_mm_mask_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epi32(src, 0, a); assert_eq_m128i(r, src); - let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtusepi32_epi8() { - let a = _mm_set_epi32(5, 6, 7, i32::MAX); - let r = _mm_maskz_cvtusepi32_epi8(0, a); + unsafe fn test_mm_maskz_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epi32(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); - let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + let r = _mm_maskz_cvttps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epi32() { + unsafe fn test_mm512_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvttps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epi32() { + unsafe fn test_mm512_mask_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); + let r = _mm512_mask_cvttps_epu32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + unsafe fn test_mm512_maskz_cvttps_epu32() { let a = _mm512_setr_ps( 0., 
-1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); + let r = _mm512_maskz_cvttps_epu32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvttps_epu32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvttps_epu32(a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = 
_mm512_i32gather_ps::<4>(index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, + unsafe fn test_mm512_mask_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr() as *const u8); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + let index = _mm512_setr_epi32( + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, ); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr() as *const u8); + assert_eq_m512i( + r, + _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = [0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); - let e = _mm512_setr_ps( - 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., 
-100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + unsafe fn test_mm512_mask_cmplt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 8., 10., 10., 12., - 12., 14., 14., 16., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, 0, a, - ); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - src, - 0b00000000_11111111, - a, - ); + unsafe fn test_mm512_cmpnlt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { - let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0, a, - ); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( - 0b00000000_11111111, - a, - ); + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { #[rustfmt::skip] - let e = _mm512_setr_ps( - 0., 4294967300., 2., 4294967300., - 4., 4294967300., 6., 4294967300., - 0., 0., 0., 0., - 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); } #[simd_test(enable = "avx512f")] - unsafe fn 
test_mm512_mask_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvt_roundps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001101_11001101); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + 
unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001000_01001000); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvt_roundps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, 0b00110010_00110010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); - let e = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_ph() { - let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); - assert_eq_m256i(r, e); + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 
0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + unsafe fn test_mm256_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtps_ph() { - let a = _mm256_set1_ps(1.); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a); - let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmp_ps_mask() { + let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm256_set1_ps(-1.); + let mask = 0b01100110; + let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtps_ph() { - let a = _mm_set1_ps(1.); - let src = _mm_set1_epi16(0); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); + unsafe fn test_mm_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + assert_eq!(m, 0b00001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtps_ph() { - let a = _mm_set1_ps(1.); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a); - let e = _mm_setr_epi64x(4323521613979991040, 0); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); + unsafe fn test_mm_mask_cmp_ps_mask() { + let a = _mm_set_ps(0., 1., -1., 13.); + let b = _mm_set1_ps(1.); + let mask = 0b11111111; + let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + assert_eq!(r, 0b00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = 
_mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_cvtph_ps(a); - let e = _mm512_set1_ps(1.); - assert_eq_m512(r, e); + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvtph_ps(src, 0, a); - assert_eq_m512(r, src); - let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); + unsafe fn test_mm512_mask_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b11000011_11000011; + let m = _mm512_mask_cmpord_ps_mask(mask, a, b); + assert_eq!(m, 0b00000001_00000001); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtph_ps() { - let a = _mm256_setr_epi64x( - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - 4323521613979991040, - ); - let r = _mm512_maskz_cvtph_ps(0, a); - assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a); - let e = _mm512_setr_ps( - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., - ); - assert_eq_m512(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm256_set1_ps(0.); - let r = _mm256_mask_cvtph_ps(src, 0, a); - assert_eq_m256(r, src); - let r = 
_mm256_mask_cvtph_ps(src, 0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm256_maskz_cvtph_ps(0, a); - assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_cvtph_ps(0b11111111, a); - let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); - assert_eq_m256(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let src = _mm_set1_ps(0.); - let r = _mm_mask_cvtph_ps(src, 0, a); - assert_eq_m128(r, src); - let r = _mm_mask_cvtph_ps(src, 0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); - } + unsafe fn test_mm512_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpunord_ps_mask(a, b); - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvtph_ps() { - let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); - let r = _mm_maskz_cvtph_ps(0, a); - assert_eq_m128(r, _mm_setzero_ps()); - let r = _mm_maskz_cvtph_ps(0b00001111, a); - let e = _mm_setr_ps(1., 1., 1., 1.); - assert_eq_m128(r, e); + assert_eq!(m, 0b11111010_11111010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b00001111_00001111; + let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); + assert_eq!(m, 0b000001010_00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = 
_mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epi32(a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = 
_mm512_mask_cvttps_epi32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); + assert_eq!(m, 1); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epi32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epi32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + unsafe fn test_mm_mask_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); + assert_eq!(m, 1); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epi32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epi32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epi32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epi32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epi32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b10000000); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epi32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epi32(0, a); - assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm_maskz_cvttps_epi32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmplt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b10000000); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_cvttps_epu32(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmplt_epu32_mask(a, b); + assert_eq!(r, 0b00001000); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvttps_epu32(src, 0, a); - assert_eq_m512i(r, src); - let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmplt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001000); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvttps_epu32() { - let a = _mm512_setr_ps( - 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, - ); - let r = _mm512_maskz_cvttps_epu32(0, a); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_cvttps_epu32(a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let src = _mm256_set1_epi32(0); - let r = _mm256_mask_cvttps_epu32(src, 0, a); - assert_eq_m256i(r, src); - let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + 
assert_eq!(r, 0b01001010_01001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_maskz_cvttps_epu32() { - let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); - let r = _mm256_maskz_cvttps_epu32(0, a); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_cvttps_epu32(0b11111111, a); - let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m256i(r, e); + unsafe fn test_mm256_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00111111); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_cvttps_epu32(a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm256_mask_cmpgt_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00111111); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let src = _mm_set1_epi32(0); - let r = _mm_mask_cvttps_epu32(src, 0, a); - assert_eq_m128i(r, src); - let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpgt_epu32_mask(a, b); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_maskz_cvttps_epu32() { - let a = _mm_set_ps(12., 13.5, 14., 15.5); - let r = _mm_maskz_cvttps_epu32(0, a); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_cvttps_epu32(0b00001111, a); - let e = _mm_set_epi32(12, 13, 14, 15); - assert_eq_m128i(r, e); + unsafe fn test_mm_mask_cmpgt_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_ps() { - let mut arr = [0f32; 256]; - for i in 0..256 { - arr[i] = i as f32; - } - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_ps::<4>(index, arr.as_ptr() as *const u8); + unsafe fn test_mm512_cmple_epu32_mask() { #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., - 120., 128., 136., 144., 152., 160., 168., 176.)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epu32_mask(a, b), + !_mm512_cmpgt_epu32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_ps() { - let mut arr = [0f32; 256]; - for i in 0..256 { - arr[i] = i as f32; - } - let src = _mm512_set1_ps(2.); - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_ps::<4>(src, mask, index, arr.as_ptr() as *const u8); + unsafe fn 
test_mm512_mask_cmple_epu32_mask() { #[rustfmt::skip] - assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., - 2., 128., 2., 144., 2., 160., 2., 176.)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + 0b01111010_01111010 + ); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32gather_epi32() { - let mut arr = [0i32; 256]; - for i in 0..256 { - arr[i] = i as i32; - } - // A multiplier of 4 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); - let r = _mm512_i32gather_epi32::<4>(index, arr.as_ptr() as *const u8); - #[rustfmt::skip] - assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmple_epu32_mask(a, b); + assert_eq!(r, 0b11000000) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_epi32() { - let mut arr = [0i32; 256]; - for i in 0..256 { - arr[i] = i as i32; - } - let src = _mm512_set1_epi32(2); - let mask = 0b10101010_10101010; - let index = _mm512_setr_epi32( - 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, - ); - // A multiplier of 4 is word-addressing - let r = _mm512_mask_i32gather_epi32::<4>(src, mask, index, arr.as_ptr() as *const u8); - assert_eq_m512i( - r, - _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240), - ); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b11000000) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_ps() { - let mut arr = [0f32; 256]; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, index, src); - let mut expected = [0f32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmple_epu32_mask(a, b); + assert_eq!(r, 0b00001100) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_ps() { - let mut arr = [0f32; 256]; - let mask = 0b10101010_10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_ps( - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., - ); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_ps::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); - let mut expected = [0f32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2. 
* (i + 1) as f32; - } - assert_eq!(&arr[..], &expected[..],); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epu32_mask(mask, a, b); + assert_eq!(r, 0b00001100) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i32scatter_epi32() { - let mut arr = [0i32; 256]; + unsafe fn test_mm512_cmpge_epu32_mask() { #[rustfmt::skip] - - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, index, src); - let mut expected = [0i32; 256]; - for i in 0..16 { - expected[i * 16] = (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epu32_mask(a, b), + !_mm512_cmplt_epu32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32scatter_epi32() { - let mut arr = [0i32; 256]; - let mask = 0b10101010_10101010; + unsafe fn test_mm512_mask_cmpge_epu32_mask() { #[rustfmt::skip] - let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240); - let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - // A multiplier of 4 is word-addressing - _mm512_mask_i32scatter_epi32::<4>(arr.as_mut_ptr() as *mut u8, mask, index, src); - let mut expected = [0i32; 256]; - for i in 0..8 { - expected[i * 32 + 16] = 2 * (i + 1) as i32; - } - assert_eq!(&arr[..], &expected[..],); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b00110000_00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let r = _mm256_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b01111111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu32_mask() { + let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); + let b = _mm256_set1_epi32(1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b01111111) + } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmplt_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let r = _mm_cmpge_epu32_mask(a, b); + assert_eq!(r, 0b00000111) } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, 
f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_ps_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu32_mask() { + let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epu32_mask(mask, a, b); + assert_eq!(r, 0b00000111) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnlt_ps_mask() { + unsafe fn test_mm512_cmpeq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmpnle_ps_mask(b, a); - assert_eq!(m, 0b00001101_00001101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpnle_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, 
f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b00001100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00001100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_ps_mask() { + unsafe fn test_mm512_cmpneq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpeq_ps_mask(b, a); - assert_eq!(m, 0b11001101_11001101); + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_ps_mask() { + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); - assert_eq!(r, 0b01001000_01001000); + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let r = _mm256_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpneq_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); + let mask = 0b11111111; + let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 
0b00110000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epu32_mask(b, a); + assert_eq!(r, 0b00000011); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpneq_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00000011); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + unsafe fn test_mm512_cmp_epu32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); - let m = _mm512_cmpneq_ps_mask(b, a); - assert_eq!(m, 0b00110010_00110010); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + unsafe fn test_mm512_mask_cmp_epu32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) + let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmp_epu32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b11001111); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let m = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001000); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmp_epu32_mask() { + let a = _mm_set_epi32(0, 1, -1, i32::MAX); + let b = _mm_set1_epi32(1); + let mask = 0b11111111; + let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001000); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_ps_mask() { + unsafe fn test_mm512_cmplt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 
0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(a, b); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_ps_mask() { + unsafe fn test_mm512_mask_cmplt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let m = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00000101); + unsafe fn test_mm256_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_ps_mask() { - let a = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm256_set1_ps(-1.); - let mask = 0b01100110; - let r = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00000100); + unsafe fn test_mm256_mask_cmplt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); - let m = _mm_cmp_ps_mask::<_CMP_LT_OQ>(a, b); - assert_eq!(m, 0b00001010); + unsafe fn test_mm_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); + let r = _mm_cmplt_epi32_mask(a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_ps_mask() { - let a = _mm_set_ps(0., 1., -1., 13.); - let b = _mm_set1_ps(1.); + unsafe fn test_mm_mask_cmplt_epi32_mask() { + let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); + let b = _mm_set1_epi32(-1); let mask = 0b11111111; - let r = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(mask, a, b); - assert_eq!(r, 0b00001010); + let r = _mm_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_round_ps_mask() { + unsafe fn test_mm512_cmpgt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); - let m = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(a, b); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = 
_mm512_cmpgt_epi32_mask(b, a); assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_round_ps_mask() { + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); - let b = _mm512_set1_ps(-1.); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(mask, a, b); + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); assert_eq!(r, 0b00000100_00000100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); - #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpord_ps_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpgt_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b11011010); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpgt_epi32_mask(a, b); + assert_eq!(r, 0b00001101); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpgt_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_cmple_epi32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b11000011_11000011; - let m = _mm512_mask_cmpord_ps_mask(mask, a, b); - assert_eq!(m, 0b00000001_00000001); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmple_epi32_mask(a, b), + !_mm512_cmpgt_epi32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_mask_cmple_epi32_mask() { #[rustfmt::skip] - let b = 
_mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let m = _mm512_cmpunord_ps_mask(a, b); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b00110000_00110000); + } - assert_eq!(m, 0b11111010_11111010); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let r = _mm256_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00100101) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let r = _mm_cmple_epi32_mask(a, b); + assert_eq!(r, 0b00000010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmple_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 200); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmple_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000010) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpunord_ps_mask() { - #[rustfmt::skip] - let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., - f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + unsafe fn test_mm512_cmpge_epi32_mask() { #[rustfmt::skip] - let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., - f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); - let mask = 0b00001111_00001111; - let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); - assert_eq!(m, 0b000001010_00001010); + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!( + _mm512_cmpge_epi32_mask(a, b), + !_mm512_cmplt_epi32_mask(a, b) + ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_ss_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + 0b01111010_01111010 + ); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = 
_mm256_set1_epi32(-1); + let r = _mm256_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b11111010) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let r = _mm_cmpge_epi32_mask(a, b); + assert_eq!(r, 0b00001111) + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpge_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); + let b = _mm_set1_epi32(-1); + let mask = 0b11111111; + let r = _mm_mask_cmpge_epi32_mask(mask, a, b); + assert_eq!(r, 0b00001111) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_ss_mask() { - let a = _mm_setr_ps(2., 1., 1., 1.); - let b = _mm_setr_ps(1., 2., 2., 2.); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_sd_mask::<_CMP_GE_OS>(a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 
0b01001010); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let m = _mm_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b00001100); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_mask_cmp_round_sd_mask() { - let a = _mm_setr_pd(2., 1.); - let b = _mm_setr_pd(1., 2.); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, a, b); - assert_eq!(m, 0); - let m = _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, a, b); - assert_eq!(m, 1); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let mask = 0b11111111; + let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00001100); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epu32_mask() { + unsafe fn test_mm512_cmpneq_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epu32_mask(a, b); - assert_eq!(m, 0b11001111_11001111); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epu32_mask() { + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b01111010_01111010; - let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b01001010_01001010); + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b10000000); + unsafe fn test_mm256_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm256_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99); - let b = _mm256_set1_epi32(1); + unsafe fn test_mm256_mask_cmpneq_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + let b = 
_mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); let mask = 0b11111111; - let r = _mm256_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b10000000); + let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110011) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmplt_epu32_mask(a, b); - assert_eq!(r, 0b00001000); + unsafe fn test_mm_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); + let r = _mm_cmpneq_epi32_mask(b, a); + assert_eq!(r, 0b00000011) } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); + unsafe fn test_mm_mask_cmpneq_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); + let b = _mm_set_epi32(0, 1, 13, 42); let mask = 0b11111111; - let r = _mm_mask_cmplt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001000); + let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000011) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epu32_mask() { + unsafe fn test_mm512_cmp_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101_00000101); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + unsafe fn test_mm512_mask_cmp_epi32_mask() { #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100_00000100); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00111111); + unsafe fn test_mm256_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00000101); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00111111); + unsafe fn test_mm256_mask_cmp_epi32_mask() { + let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm256_set1_epi32(-1); + let mask = 0b01100110; + let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00000100); } 
#[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + unsafe fn test_mm_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); let b = _mm_set1_epi32(1); - let r = _mm_cmpgt_epu32_mask(a, b); - assert_eq!(r, 0b00000011); + let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); + assert_eq!(m, 0b00001010); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); + unsafe fn test_mm_mask_cmp_epi32_mask() { + let a = _mm_set_epi32(0, 1, -1, 13); let b = _mm_set1_epi32(1); let mask = 0b11111111; - let r = _mm_mask_cmpgt_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000011); + let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); + assert_eq!(r, 0b00001010); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epu32_mask(a, b), - !_mm512_cmpgt_epu32_mask(a, b) + unsafe fn test_mm512_set_epi8() { + let r = _mm512_set1_epi8(2); + assert_eq_m512i( + r, + _mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + ), ) } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmple_epu32_mask(mask, a, b), - 0b01111010_01111010 - ); + unsafe fn test_mm512_set_epi16() { + let r = _mm512_set1_epi16(2); + assert_eq_m512i( + r, + _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmple_epu32_mask(a, b); - assert_eq!(r, 0b11000000) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b11000000) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmple_epu32_mask(a, b); - assert_eq!(r, 0b00001100) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi8() { + let r = 
_mm512_set_epi8( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi8(2)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epu32_mask(mask, a, b); - assert_eq!(r, 0b00001100) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi16() { + let r = _mm512_set_epi16( + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + assert_eq_m512i(r, _mm512_set1_epi16(2)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epu32_mask(a, b), - !_mm512_cmplt_epu32_mask(a, b) - ) + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b01100000_0110000); + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let r = _mm256_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b01111111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_epi32() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu32_mask() { - let a = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200); - let b = _mm256_set1_epi32(1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b01111111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let r = _mm_cmpge_epu32_mask(a, b); - assert_eq!(r, 0b00000111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu32_mask() { - let a = _mm_set_epi32(0, 1, 2, u32::MAX as i32); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = 
_mm_mask_cmpge_epu32_mask(mask, a, b); - assert_eq!(r, 0b00000111) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + unsafe fn test_mm512_set4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + unsafe fn test_mm512_set4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_epi32() { + let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); + assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b01001010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr4_ps() { + let r = _mm512_set_ps( + 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., + ); + assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epu32_mask(b, a); - assert_eq!(m, 0b00001100); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00001100); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero() { + assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); } #[simd_test(enable = "avx512f")] 
- unsafe fn test_mm512_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epu32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + unsafe fn test_mm512_loadu_pd() { + let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; + let p = a.as_ptr(); + let r = _mm512_loadu_pd(black_box(p)); + let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010); + unsafe fn test_mm512_storeu_pd() { + let a = _mm512_set1_pd(9.); + let mut r = _mm512_undefined_pd(); + _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); + assert_eq_m512d(r, a); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let r = _mm256_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00110000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_loadu_ps() { + let a = &[ + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ]; + let p = a.as_ptr(); + let r = _mm512_loadu_ps(black_box(p)); + let e = _mm512_setr_ps( + 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epu32_mask(mask, b, a); - assert_eq!(r, 0b00110000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_storeu_ps() { + let a = _mm512_set1_ps(9.); + let mut r = _mm512_undefined_ps(); + _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); + assert_eq_m512(r, a); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epu32_mask(b, a); - assert_eq!(r, 0b00000011); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epu32_mask(mask, b, a); - 
assert_eq!(r, 0b00000011); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111_11001111); + unsafe fn test_mm512_mask_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let src = _mm512_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epu32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01001010_01001010); + unsafe fn test_mm512_maskz_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi32() { + let mut r = [42_i32; 16]; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b11001111); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], + } + let mut r = Align { data: [42; 16] }; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); 
+ assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let m = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epu32_mask() { - let a = _mm_set_epi32(0, 1, -1, i32::MAX); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001000); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmplt_epi32_mask(a, b); - assert_eq!(m, 0b00000101_00000101); + unsafe fn test_mm512_mask_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let src = _mm512_set1_epi64(42); + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmplt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + unsafe fn test_mm512_maskz_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi64() { + let mut r = [42_i64; 8]; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmplt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask 
= 0b11111111; - let r = _mm256_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + let p = r.data.as_mut_ptr(); + _mm512_mask_store_epi64(p, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let r = _mm_cmplt_epi32_mask(a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_ps() { + let src = _mm512_set1_ps(42.0); + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmplt_epi32_mask() { - let a = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmplt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_ps() { + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmpgt_epi32_mask(b, a); - assert_eq!(m, 0b00000101_00000101); + unsafe fn test_mm512_mask_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let src = _mm512_set1_ps(42.0); + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpgt_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000100_00000100); + unsafe fn test_mm512_maskz_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = 
_mm512_maskz_load_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b11011010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_ps() { + let mut r = [42_f32; 16]; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpgt_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b11011010); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], + } + let mut r = Align { data: [42.0; 16] }; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpgt_epi32_mask(a, b); - assert_eq!(r, 0b00001101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_pd() { + let src = _mm512_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpgt_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpgt_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001101); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmple_epi32_mask(a, b), - !_mm512_cmpgt_epi32_mask(a, b) - ) + unsafe fn test_mm512_mask_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let src = _mm512_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = 
a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmple_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b01100000_0110000); + unsafe fn test_mm512_mask_storeu_pd() { + let mut r = [42_f64; 8]; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00100101) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00100101) + unsafe fn test_mm256_mask_loadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let r = _mm_cmple_epi32_mask(a, b); - assert_eq!(r, 0b00000010) + unsafe fn test_mm256_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmple_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 200); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmple_epi32_mask(mask, a, b); - assert_eq!(r, 0b00000010) - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, 
i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - assert_eq!( - _mm512_cmpge_epi32_mask(a, b), - !_mm512_cmplt_epi32_mask(a, b) - ) + unsafe fn test_mm256_mask_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let src = _mm256_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpge_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01111010_01111010; - assert_eq!( - _mm512_mask_cmpge_epi32_mask(mask, a, b), - 0b01111010_01111010 - ); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let r = _mm256_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b11111010) + unsafe fn test_mm256_mask_storeu_epi32() { + let mut r = [42_i32; 8]; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm256_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b11111010) + unsafe fn test_mm256_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 8], + } + let mut r = Align { data: [42; 8] }; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let r = _mm_cmpge_epi32_mask(a, b); - assert_eq!(r, 0b00001111) + unsafe fn test_mm256_mask_loadu_epi64() { + let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, u32::MAX as i32); - let b = _mm_set1_epi32(-1); - let mask = 0b11111111; - let r = _mm_mask_cmpge_epi32_mask(mask, a, b); - assert_eq!(r, 0b00001111) + unsafe fn test_mm256_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = 
a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let src = _mm256_set1_epi64x(42); + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpeq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b11001111); + unsafe fn test_mm256_mask_storeu_epi64() { + let mut r = [42_i64; 4]; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010; - let r = _mm256_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b01001010); + unsafe fn test_mm256_mask_store_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], + } + let mut r = Align { data: [42; 4] }; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let m = _mm_cmpeq_epi32_mask(b, a); - assert_eq!(m, 0b00001100); + unsafe fn test_mm256_mask_loadu_ps() { + let src = _mm256_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 
0b11001010; + let r = _mm256_mask_loadu_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpeq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00001100); + unsafe fn test_mm256_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm512_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let src = _mm256_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmpneq_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, - 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - #[rustfmt::skip] - let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, - 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b01111010_01111010; - let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let m = _mm256_cmpneq_epi32_mask(b, a); - assert_eq!(m, !_mm256_cmpeq_epi32_mask(b, a)); + unsafe fn test_mm256_mask_storeu_ps() { + let mut r = [42_f32; 8]; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); - let b = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); - let mask = 0b11111111; - let r = _mm256_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00110011) + unsafe fn 
test_mm256_mask_store_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], + } + let mut r = Align { data: [42.0; 8] }; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let r = _mm_cmpneq_epi32_mask(b, a); - assert_eq!(r, 0b00000011) + unsafe fn test_mm256_mask_loadu_pd() { + let src = _mm256_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set_epi32(0, 1, 13, 42); - let mask = 0b11111111; - let r = _mm_mask_cmpneq_epi32_mask(mask, b, a); - assert_eq!(r, 0b00000011) + unsafe fn test_mm256_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let m = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101_00000101); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let src = _mm256_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cmp_epi32_mask() { - #[rustfmt::skip] - let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, - 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm512_set1_epi32(-1); - let mask = 0b01100110_01100110; - let r = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100_00000100); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm256_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let m = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00000101); + unsafe fn test_mm256_mask_storeu_pd() { + let mut r = [42_f64; 4]; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - 
unsafe fn test_mm256_mask_cmp_epi32_mask() { - let a = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); - let b = _mm256_set1_epi32(-1); - let mask = 0b01100110; - let r = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00000100); + unsafe fn test_mm256_mask_store_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let m = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b00001010); + unsafe fn test_mm_mask_loadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f,avx512vl")] - unsafe fn test_mm_mask_cmp_epi32_mask() { - let a = _mm_set_epi32(0, 1, -1, 13); - let b = _mm_set1_epi32(1); - let mask = 0b11111111; - let r = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b00001010); + unsafe fn test_mm_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi8() { - let r = _mm512_set1_epi8(2); - assert_eq_m512i( - r, - _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let src = _mm_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi16() { - let r = _mm512_set1_epi16(2); - assert_eq_m512i( - r, - _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_epi32() { - let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi32() { + let mut r = [42_i32; 4]; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); +
assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_epi32() { - let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq_m512i( - r, - _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let mut r = Align { data: [42; 4] }; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi8() { - let r = _mm512_set_epi8( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi8(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi16() { - let r = _mm512_set_epi16( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, - ); - assert_eq_m512i(r, _mm512_set1_epi16(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_epi32() { - let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - assert_eq_m512i(r, _mm512_set1_epi32(2)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let src = _mm_set1_epi64x(42); + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_si512() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_epi32() { - assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32()); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi64() { + let mut r = [42_i64; 2]; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set_ps() { - let r = _mm512_setr_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - 
_mm512_set_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let mut r = Align { data: [42; 2] }; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr_ps() { - let r = _mm512_set_ps( - 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., - ); - assert_eq_m512( - r, - _mm512_setr_ps( - 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., - ), - ) + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_ps() { + let src = _mm_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set1_ps() { - #[rustfmt::skip] - let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., - 2., 2., 2., 2., 2., 2., 2., 2.); - assert_eq_m512(expected, _mm512_set1_ps(2.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let src = _mm_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_set4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_epi32() { - let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1); - assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_ps() { + let mut r = [42_f32; 4]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setr4_ps() { - let r = _mm512_set_ps( - 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., - ); - assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.)); + #[simd_test(enable 
= "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero_ps() { - assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_pd() { + let src = _mm_set1_pd(42.0); + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_setzero() { - assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.)); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_pd() { - let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; - let p = a.as_ptr(); - let r = _mm512_loadu_pd(black_box(p)); - let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.); - assert_eq_m512d(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let src = _mm_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_pd() { - let a = _mm512_set1_pd(9.); - let mut r = _mm512_undefined_pd(); - _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a); - assert_eq_m512d(r, a); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_loadu_ps() { - let a = &[ - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ]; - let p = a.as_ptr(); - let r = _mm512_loadu_ps(black_box(p)); - let e = _mm512_setr_ps( - 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50., - ); - assert_eq_m512(r, e); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_pd() { + let mut r = [42_f64; 2]; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_storeu_ps() { - let a = _mm512_set1_ps(9.); - let mut r = _mm512_undefined_ps(); - _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a); - assert_eq_m512(r, a); + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let mut r = Align { data: [42.0; 2] }; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_store_pd(r.data.as_mut_ptr(), 
m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")]