Improve performance of saturating float to int casts (#108)

* Change target folder for native CPU build in CI

  GitHub Actions seems to have started scanning executables. However, while the scan is still in progress, we redo the build for the native CPU. Cargo will then try to replace the previously built executable and fail because the scan is still in progress. By switching the target folder for the native build, we can avoid this problem.

* Improve performance of saturating float to int casts

  I randomly stumbled across a Chrome bug ticket reporting that their lowering of saturating float to int casts is suboptimal: https://bugs.chromium.org/p/v8/issues/detail?id=12094 They have since merged a fix. The code I ported was based on Chrome, and thus the same performance pitfall applies to wide.
Lokathor · Sep 14, 2021 · 844d8c7 · 844d8c7
1 parent 9352715
commit 844d8c7
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 29 deletions.
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -11,27 +11,27 @@ jobs:
       matrix:
         rust:
         # x86 without sse/sse2 on by default
-        - { target: i586-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
+        - { target: i586-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
         - { target: i586-pc-windows-msvc, toolchain: stable, os: windows-latest }
         - { target: i586-pc-windows-msvc, toolchain: beta, os: windows-latest }
         - { target: i586-pc-windows-msvc, toolchain: nightly, os: windows-latest }
         # x86
-        - { target: i686-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
+        - { target: i686-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
         - { target: i686-pc-windows-msvc, toolchain: stable, os: windows-latest }
         - { target: i686-pc-windows-msvc, toolchain: beta, os: windows-latest }
         - { target: i686-pc-windows-msvc, toolchain: nightly, os: windows-latest }
         # x86_64
-        - { target: x86_64-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
+        - { target: x86_64-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
         - { target: x86_64-pc-windows-msvc, toolchain: stable, os: windows-latest }
         - { target: x86_64-pc-windows-msvc, toolchain: beta, os: windows-latest }
         - { target: x86_64-pc-windows-msvc, toolchain: nightly, os: windows-latest }
         ## arm
-        #- { target: arm-unknown-linux-gnueabihf, toolchain: 1.52.0, os: ubuntu-latest }
+        #- { target: arm-unknown-linux-gnueabihf, toolchain: 1.52.1, os: ubuntu-latest }
         #- { target: arm-unknown-linux-gnueabihf, toolchain: stable, os: ubuntu-latest }
         #- { target: arm-unknown-linux-gnueabihf, toolchain: beta, os: ubuntu-latest }
         #- { target: arm-unknown-linux-gnueabihf, toolchain: nightly, os: ubuntu-latest }
         ## aarch64
-        #- { target: aarch64-unknown-linux-gnu, toolchain: 1.52.0, os: ubuntu-latest }
+        #- { target: aarch64-unknown-linux-gnu, toolchain: 1.52.1, os: ubuntu-latest }
         #- { target: aarch64-unknown-linux-gnu, toolchain: stable, os: ubuntu-latest }
         #- { target: aarch64-unknown-linux-gnu, toolchain: beta, os: ubuntu-latest }
         #- { target: aarch64-unknown-linux-gnu, toolchain: nightly, os: ubuntu-latest }
@@ -75,6 +75,11 @@ jobs:
     - name: Test with 'native' CPU features + No Default Cargo Features
       if: matrix.rust.target == 'i586-pc-windows-msvc' || matrix.rust.target == 'i686-pc-windows-msvc' || matrix.rust.target == 'x86_64-pc-windows-msvc' || matrix.rust.target == 'wasm32-wasi'
       run: cargo test --target ${{ matrix.rust.target }} --no-default-features
+      env:
+        CARGO_TARGET_DIR: "target-native"
+
     - name: Test with 'native' CPU features + All Cargo Features
       if: matrix.rust.target == 'i586-pc-windows-msvc' || matrix.rust.target == 'i686-pc-windows-msvc' || matrix.rust.target == 'x86_64-pc-windows-msvc' || matrix.rust.target == 'wasm32-wasi'
       run: cargo test --target ${{ matrix.rust.target }} --all-features
+      env:
+        CARGO_TARGET_DIR: "target-native"
diff --git a/src/f32x4_.rs b/src/f32x4_.rs
@@ -676,12 +676,12 @@ impl f32x4 {
   pub fn round_int(self) -> i32x4 {
     pick! {
       if #[cfg(target_feature="sse2")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x4 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="simd128")] {
         cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
       } else {
@@ -719,12 +719,12 @@ impl f32x4 {
   pub fn trunc_int(self) -> i32x4 {
     pick! {
       if #[cfg(target_feature="sse2")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x4 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="simd128")] {
         cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
       } else {

diff --git a/src/f32x8_.rs b/src/f32x8_.rs
@@ -801,19 +801,19 @@ impl f32x8 {
   pub fn round_int(self) -> i32x8 {
     pick! {
       if #[cfg(target_feature="avx")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="sse2")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="simd128")] {
         cast(Self {
           simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)),
@@ -860,19 +860,19 @@ impl f32x8 {
   pub fn trunc_int(self) -> i32x8 {
     pick! {
         if #[cfg(target_feature="avx")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="sse2")] {
-        // Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
+        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
         let non_nan_mask = self.cmp_eq(self);
         let non_nan = self & non_nan_mask;
-        let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
-        let dst: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
-        dst ^ ((scratch & dst) >> 31)
+        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
+        let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
+        flip_to_max ^ cast
       } else if #[cfg(target_feature="simd128")] {
         cast(Self {
           simd0: i32x4_trunc_sat_f32x4(self.simd0),