sse: _mm_cvtpi16_ps, _mm_cvtpu16_ps, _mm_cvtpi8_ps, _mm_cvtpu8_ps (rust-lang#255)

* sse: _mm_cvtpi16_ps, _mm_cvtpu16_ps, _mm_cvtpi8_ps, _mm_cvtpu8_ps

  And mmx: _mm_cmpgt_pi8, _mm_cmpgt_pi16, _mm_unpackhi_pi16, _mm_unpacklo_pi8, _mm_unpacklo_pi16

* Fix: literal out of range
danielverkamp · Dec 30, 2017 · 2d77311 · 2d77311
1 parent ab543f1
commit 2d77311
Show file tree

Hide file tree

Showing 2 changed files with 183 additions and 0 deletions.
diff --git a/coresimd/src/x86/i686/mmx.rs b/coresimd/src/x86/i686/mmx.rs
@@ -48,12 +48,73 @@ pub unsafe fn _mm_packs_pi32(a: i32x2, b: i32x2) -> i16x4 {
     mem::transmute(packssdw(mem::transmute(a), mem::transmute(b)))
 }
 
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// [8 x i8], lane by lane, to determine whether each element of `a` is
+/// greater than the corresponding element of `b`.
+///
+/// Each result lane is 0 for false, 0xFF (all bits set) for true.
+#[inline(always)]
+#[target_feature = "+mmx"]
+#[cfg_attr(test, assert_instr(pcmpgtb))]
+pub unsafe fn _mm_cmpgt_pi8(a: i8x8, b: i8x8) -> i8x8 {
+    mem::transmute(pcmpgtb(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// [4 x i16], lane by lane, to determine whether each element of `a` is
+/// greater than the corresponding element of `b`.
+///
+/// Each result lane is 0 for false, 0xFFFF (all bits set) for true.
+#[inline(always)]
+#[target_feature = "+mmx"]
+#[cfg_attr(test, assert_instr(pcmpgtw))]
+pub unsafe fn _mm_cmpgt_pi16(a: i16x4, b: i16x4) -> i16x4 {
+    mem::transmute(pcmpgtw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Interleaves the upper two elements (upper 32 bits) of two 64-bit integer
+/// vectors of [4 x i16], yielding the [4 x i16] vector [a2, b2, a3, b3].
+#[inline(always)]
+#[target_feature = "+mmx"]
+#[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected
+pub unsafe fn _mm_unpackhi_pi16(a: i16x4, b: i16x4) -> i16x4 {
+    mem::transmute(punpckhwd(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Interleaves the lower four elements (lower 32 bits) of two 64-bit integer
+/// vectors of [8 x i8], yielding [a0, b0, a1, b1, a2, b2, a3, b3].
+#[inline(always)]
+#[target_feature = "+mmx"]
+#[cfg_attr(test, assert_instr(punpcklbw))]
+pub unsafe fn _mm_unpacklo_pi8(a: i8x8, b: i8x8) -> i8x8 {
+    mem::transmute(punpcklbw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Interleaves the lower two elements (lower 32 bits) of two 64-bit integer
+/// vectors of [4 x i16], yielding the [4 x i16] vector [a0, b0, a1, b1].
+#[inline(always)]
+#[target_feature = "+mmx"]
+#[cfg_attr(test, assert_instr(punpcklwd))]
+pub unsafe fn _mm_unpacklo_pi16(a: i16x4, b: i16x4) -> i16x4 {
+    mem::transmute(punpcklwd(mem::transmute(a), mem::transmute(b)))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.mmx.packsswb"]
     fn packsswb(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.packssdw"]
     fn packssdw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pcmpgt.b"] // backs _mm_cmpgt_pi8
+    fn pcmpgtb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pcmpgt.w"] // backs _mm_cmpgt_pi16
+    fn pcmpgtw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpckhwd"] // backs _mm_unpackhi_pi16
+    fn punpckhwd(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpcklbw"] // backs _mm_unpacklo_pi8
+    fn punpcklbw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.punpcklwd"] // backs _mm_unpacklo_pi16
+    fn punpcklwd(a: __m64, b: __m64) -> __m64;
 }
 
 #[cfg(test)]
@@ -83,4 +144,44 @@ mod tests {
         let r = i16x4::new(-1, 2, -5, 6);
         assert_eq!(r, mmx::_mm_packs_pi32(a, b));
     }
+
+    #[simd_test = "mmx"]
+    unsafe fn _mm_cmpgt_pi8() {
+        let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b = i8x8::new(8, 7, 6, 5, 4, 3, 2, 1);
+        let r = i8x8::new(0, 0, 0, 0, 0, -1, -1, -1); // -1 where a > b; 4 == 4 is false
+        assert_eq!(r, mmx::_mm_cmpgt_pi8(a, b));
+    }
+
+    #[simd_test = "mmx"]
+    unsafe fn _mm_cmpgt_pi16() {
+        let a = i16x4::new(0, 1, 2, 3);
+        let b = i16x4::new(4, 3, 2, 1);
+        let r = i16x4::new(0, 0, 0, -1); // -1 where a > b; 2 == 2 is false
+        assert_eq!(r, mmx::_mm_cmpgt_pi16(a, b));
+    }
+
+    #[simd_test = "mmx"]
+    unsafe fn _mm_unpackhi_pi16() {
+        let a = i16x4::new(0, 1, 2, 3);
+        let b = i16x4::new(4, 5, 6, 7);
+        let r = i16x4::new(2, 6, 3, 7); // [a2, b2, a3, b3]
+        assert_eq!(r, mmx::_mm_unpackhi_pi16(a, b));
+    }
+
+    #[simd_test = "mmx"]
+    unsafe fn _mm_unpacklo_pi8() {
+        let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b = i8x8::new(8, 9, 10, 11, 12, 13, 14, 15);
+        let r = i8x8::new(0, 8, 1, 9, 2, 10, 3, 11); // [a0, b0, a1, b1, a2, b2, a3, b3]
+        assert_eq!(r, mmx::_mm_unpacklo_pi8(a, b));
+    }
+
+    #[simd_test = "mmx"]
+    unsafe fn _mm_unpacklo_pi16() {
+        let a = i16x4::new(0, 1, 2, 3);
+        let b = i16x4::new(4, 5, 6, 7);
+        let r = i16x4::new(0, 4, 1, 5); // [a0, b0, a1, b1]
+        assert_eq!(r, mmx::_mm_unpacklo_pi16(a, b));
+    }
 }
diff --git a/coresimd/src/x86/i686/sse.rs b/coresimd/src/x86/i686/sse.rs
@@ -221,6 +221,56 @@ pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
     _mm_cvtpi32_ps(a, b)
 }
 
+/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+/// float].
+#[inline(always)]
+#[target_feature = "+sse"]
+pub unsafe fn _mm_cvtpi16_ps(a: i16x4) -> f32x4 {
+    let b = mmx::_mm_setzero_si64();
+    let b = mmx::_mm_cmpgt_pi16(mem::transmute(b), a); // mask lanes set where setzero > a
+    let c = mmx::_mm_unpackhi_pi16(a, b); // [a2, b2, a3, b3]: upper values paired with masks
+    let r = i586::_mm_setzero_ps();
+    let r = cvtpi2ps(r, mem::transmute(c)); // NOTE(review): presumably fills the low two lanes
+    let r = i586::_mm_movelh_ps(r, r); // NOTE(review): presumably copies low lanes to high
+    let c = mmx::_mm_unpacklo_pi16(a, b); // [a0, b0, a1, b1]: lower values paired with masks
+    cvtpi2ps(r, mem::transmute(c))
+}
+
+/// Converts a 64-bit vector of 16-bit unsigned integer values into a
+/// 128-bit vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse"]
+pub unsafe fn _mm_cvtpu16_ps(a: u16x4) -> f32x4 {
+    let b = mem::transmute(mmx::_mm_setzero_si64()); // setzero vector used as the paired lanes
+    let c = mmx::_mm_unpackhi_pi16(a.as_i16x4(), b); // [a2, b2, a3, b3]
+    let r = i586::_mm_setzero_ps();
+    let r = cvtpi2ps(r, mem::transmute(c)); // NOTE(review): presumably fills the low two lanes
+    let r = i586::_mm_movelh_ps(r, r); // NOTE(review): presumably copies low lanes to high
+    let c = mmx::_mm_unpacklo_pi16(a.as_i16x4(), b); // [a0, b0, a1, b1]
+    cvtpi2ps(r, mem::transmute(c))
+}
+
+/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+/// into a 128-bit vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse"]
+pub unsafe fn _mm_cvtpi8_ps(a: i8x8) -> f32x4 {
+    let b = mmx::_mm_setzero_si64();
+    let b = mmx::_mm_cmpgt_pi8(mem::transmute(b), a); // mask lanes set where setzero > a
+    let b = mmx::_mm_unpacklo_pi8(a, b); // [a0, m0, a1, m1, a2, m2, a3, m3], m = mask
+    _mm_cvtpi16_ps(mem::transmute(b)) // byte pairs reinterpreted as the [4 x i16] input
+}
+
+/// Converts the lower four unsigned 8-bit integer values from a 64-bit
+/// vector of [8 x u8] into a 128-bit vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse"]
+pub unsafe fn _mm_cvtpu8_ps(a: u8x8) -> f32x4 {
+    let b = mmx::_mm_setzero_si64();
+    let b = mmx::_mm_unpacklo_pi8(a.as_i8x8(), mem::transmute(b)); // [a0, z0, ..., a3, z3], z = setzero lanes
+    _mm_cvtpi16_ps(mem::transmute(b)) // byte pairs reinterpreted as the [4 x i16] input
+}
+
 /// Converts the two 32-bit signed integer values from each 64-bit vector
 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
 #[inline(always)]
@@ -507,6 +557,38 @@ mod tests {
         assert_eq!(r, expected);
     }
 
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi16_ps() {
+        let a = i16x4::new(1, 2, 3, 4); // NOTE(review): no negative lanes are exercised
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi16_ps(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpu16_ps() {
+        let a = u16x4::new(1, 2, 3, 4); // NOTE(review): no values above i16::MAX are exercised
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpu16_ps(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi8_ps() {
+        let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); // lanes 4-7 do not appear in `expected`
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi8_ps(a);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpu8_ps() {
+        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); // lanes 4-7 do not appear in `expected`
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpu8_ps(a);
+        assert_eq!(r, expected);
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_cvtpi32x2_ps() {
         let a = i32x2::new(1, 2);