Skip to content

Commit

Permalink
Improve performance of saturating float to int casts (#108)
Browse files Browse the repository at this point in the history
* Change target folder for native cpu build in CI

GitHub Actions seems to have started scanning executables. However, while
the scan is still in progress, we redo the build for the native CPU.
Cargo then tries to replace the previously built executable and fails
because the scan is still in progress. By switching the target folder
for the native build, we can avoid this problem.

* Improve performance of saturating float to int casts

I randomly stumbled across a Chrome bug ticket reporting that their
lowering of saturating float-to-int casts is suboptimal:

https://bugs.chromium.org/p/v8/issues/detail?id=12094

They have since merged a fix. The code I ported was based on Chrome's, and
thus the same performance pitfall applies to `wide`.
  • Loading branch information
CryZe authored Sep 14, 2021
1 parent 9352715 commit 844d8c7
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 29 deletions.
15 changes: 10 additions & 5 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,27 @@ jobs:
matrix:
rust:
# x86 without sse/sse2 on by default
- { target: i586-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
- { target: i586-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
- { target: i586-pc-windows-msvc, toolchain: stable, os: windows-latest }
- { target: i586-pc-windows-msvc, toolchain: beta, os: windows-latest }
- { target: i586-pc-windows-msvc, toolchain: nightly, os: windows-latest }
# x86
- { target: i686-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
- { target: i686-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
- { target: i686-pc-windows-msvc, toolchain: stable, os: windows-latest }
- { target: i686-pc-windows-msvc, toolchain: beta, os: windows-latest }
- { target: i686-pc-windows-msvc, toolchain: nightly, os: windows-latest }
# x86_64
- { target: x86_64-pc-windows-msvc, toolchain: 1.52.0, os: windows-latest }
- { target: x86_64-pc-windows-msvc, toolchain: 1.52.1, os: windows-latest }
- { target: x86_64-pc-windows-msvc, toolchain: stable, os: windows-latest }
- { target: x86_64-pc-windows-msvc, toolchain: beta, os: windows-latest }
- { target: x86_64-pc-windows-msvc, toolchain: nightly, os: windows-latest }
## arm
#- { target: arm-unknown-linux-gnueabihf, toolchain: 1.52.0, os: ubuntu-latest }
#- { target: arm-unknown-linux-gnueabihf, toolchain: 1.52.1, os: ubuntu-latest }
#- { target: arm-unknown-linux-gnueabihf, toolchain: stable, os: ubuntu-latest }
#- { target: arm-unknown-linux-gnueabihf, toolchain: beta, os: ubuntu-latest }
#- { target: arm-unknown-linux-gnueabihf, toolchain: nightly, os: ubuntu-latest }
## aarch64
#- { target: aarch64-unknown-linux-gnu, toolchain: 1.52.0, os: ubuntu-latest }
#- { target: aarch64-unknown-linux-gnu, toolchain: 1.52.1, os: ubuntu-latest }
#- { target: aarch64-unknown-linux-gnu, toolchain: stable, os: ubuntu-latest }
#- { target: aarch64-unknown-linux-gnu, toolchain: beta, os: ubuntu-latest }
#- { target: aarch64-unknown-linux-gnu, toolchain: nightly, os: ubuntu-latest }
Expand Down Expand Up @@ -75,6 +75,11 @@ jobs:
- name: Test with 'native' CPU features + No Default Cargo Features
if: matrix.rust.target == 'i586-pc-windows-msvc' || matrix.rust.target == 'i686-pc-windows-msvc' || matrix.rust.target == 'x86_64-pc-windows-msvc' || matrix.rust.target == 'wasm32-wasi'
run: cargo test --target ${{ matrix.rust.target }} --no-default-features
env:
CARGO_TARGET_DIR: "target-native"

- name: Test with 'native' CPU features + All Cargo Features
if: matrix.rust.target == 'i586-pc-windows-msvc' || matrix.rust.target == 'i686-pc-windows-msvc' || matrix.rust.target == 'x86_64-pc-windows-msvc' || matrix.rust.target == 'wasm32-wasi'
run: cargo test --target ${{ matrix.rust.target }} --all-features
env:
CARGO_TARGET_DIR: "target-native"
16 changes: 8 additions & 8 deletions src/f32x4_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -676,12 +676,12 @@ impl f32x4 {
pub fn round_int(self) -> i32x4 {
pick! {
if #[cfg(target_feature="sse2")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x4 = cast(non_nan_mask ^ non_nan);
let dst: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
flip_to_max ^ cast
} else if #[cfg(target_feature="simd128")] {
cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
} else {
Expand Down Expand Up @@ -719,12 +719,12 @@ impl f32x4 {
pub fn trunc_int(self) -> i32x4 {
pick! {
if #[cfg(target_feature="sse2")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x4 = cast(non_nan_mask ^ non_nan);
let dst: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
flip_to_max ^ cast
} else if #[cfg(target_feature="simd128")] {
cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
} else {
Expand Down
32 changes: 16 additions & 16 deletions src/f32x8_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -801,19 +801,19 @@ impl f32x8 {
pub fn round_int(self) -> i32x8 {
pick! {
if #[cfg(target_feature="avx")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
let dst: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
flip_to_max ^ cast
} else if #[cfg(target_feature="sse2")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
let dst: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
flip_to_max ^ cast
} else if #[cfg(target_feature="simd128")] {
cast(Self {
simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)),
Expand Down Expand Up @@ -860,19 +860,19 @@ impl f32x8 {
pub fn trunc_int(self) -> i32x8 {
pick! {
if #[cfg(target_feature="avx")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
let dst: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
flip_to_max ^ cast
} else if #[cfg(target_feature="sse2")] {
// Based on: https://github.com/v8/v8/blob/121df413a3abb5272e971e61bebf0e84efa175f2/src/compiler/backend/ia32/code-generator-ia32.cc#L2457
// Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
let non_nan_mask = self.cmp_eq(self);
let non_nan = self & non_nan_mask;
let scratch: i32x8 = cast(non_nan_mask ^ non_nan);
let dst: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
dst ^ ((scratch & dst) >> 31)
let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
flip_to_max ^ cast
} else if #[cfg(target_feature="simd128")] {
cast(Self {
simd0: i32x4_trunc_sat_f32x4(self.simd0),
Expand Down

0 comments on commit 844d8c7

Please sign in to comment.