From d840fb6b9298d2e3364bd2f243b47e48d795fd95 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Thu, 5 Sep 2024 09:03:28 -0700
Subject: [PATCH] Use the trifecta div algorithm for 128-bit div on wasm

This commit updates the `#[cfg]` annotations used to select the
implementation of 128-bit division in compiler-builtins on wasm targets.
This is done in relation to
https://github.com/WebAssembly/128-bit-arithmetic where performance of
128-bit operations is being investigated on WebAssembly. While I don't
know much about the particulars of the two algorithms involved here, the
comments indicate that the "trifecta" variant is preferred if possible,
but it's not selected on 32-bit architectures. This rationale isn't as
applicable to WebAssembly targets because, despite the 32-bit pointer
width, there are often wider-than-pointer operations available as it's
typically run on 64-bit machines.

In local testing, a benchmark that performs division with a Rust-based
bignum library went from 350% slower-than-native to 220%
slower-than-native with this change, a nice increase in speed. While
this was tested with Wasmtime, other runtimes are likely to see an
improvement as well.
---
 src/int/specialized_div_rem/mod.rs | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs
index 760f5f5b..a91fe663 100644
--- a/src/int/specialized_div_rem/mod.rs
+++ b/src/int/specialized_div_rem/mod.rs
@@ -136,9 +136,15 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
 
 // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
-// faster if the target pointer width is at least 64.
+// faster if the target pointer width is at least 64. Note that this
+// implementation is additionally included on WebAssembly despite the typical
+// pointer width there being 32 because it's typically run on a 64-bit machine
+// that has access to faster 64-bit operations.
 #[cfg(all(
-    not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    any(
+        target_family = "wasm",
+        not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    ),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
     not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
@@ -152,10 +158,14 @@ impl_trifecta!(
     u128
 );
 
-// If the pointer width less than 64, then the target architecture almost certainly does not have
-// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
+// If the pointer width is less than 64 and this isn't wasm, then the target
+// architecture almost certainly does not have the fast 64 to 128 bit widening
+// multiplication needed for `trifecta` to be faster.
 #[cfg(all(
-    any(target_pointer_width = "16", target_pointer_width = "32"),
+    not(any(
+        target_family = "wasm",
+        not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    )),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
     not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]