From e5ddb686d54cf755f61ed4a7cd9f0195984cf170 Mon Sep 17 00:00:00 2001
From: Stephen Canon <scanon@apple.com>
Date: Mon, 14 Jun 2021 16:22:32 -0400
Subject: [PATCH] Use the custom implementation of multipliedFullWidth on
 arm64_32

Previously, when targeting arm64_32, we were falling back on the generic implementation of multipliedFullWidth for 64-bit integers, which resulted in the following codegen:

00000008	asr	x8, x0, #32
0000000c	asr	x9, x0, #63
00000010	cmp	x0, #0x0
00000014	cinv	w10, w0, lt
00000018	eor	w9, w10, w9
0000001c	asr	x10, x1, #32
00000020	asr	x11, x1, #63
00000024	cmp	x1, #0x0
00000028	cinv	w12, w1, lt
0000002c	eor	w11, w12, w11
00000030	umull	x12, w11, w9
00000034	mul	x11, x11, x8
00000038	add	x11, x11, x12, lsr #32
0000003c	asr	x12, x11, #63
00000040	cmp	x11, #0x0
00000044	cinv	w13, w11, lt
00000048	eor	w12, w13, w12
0000004c	madd	x9, x9, x10, x12
00000050	mul	x8, x10, x8
00000054	add	x8, x8, x11, asr #32
00000058	add	x0, x8, x9, asr #32
0000005c	ret

Instead, we should use the 64b implementation when targeting arm64_32, which allows us to generate:

00000008	smulh	x0, x1, x0
0000000c	ret

Unsurprisingly, this is considerably faster.
---
 stdlib/public/core/IntegerTypes.swift.gyb | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/stdlib/public/core/IntegerTypes.swift.gyb b/stdlib/public/core/IntegerTypes.swift.gyb
index 96f21a011976b..b4b2083b9df0d 100644
--- a/stdlib/public/core/IntegerTypes.swift.gyb
+++ b/stdlib/public/core/IntegerTypes.swift.gyb
@@ -1514,15 +1514,23 @@ ${assignmentOperatorComment(x.operator, True)}
 % end
 
 %   dbits = bits*2
-%   if bits <= word_bits:
+%   if bits == 64:
+  #if !(arch(arm) || arch(i386) || arch(wasm32))
+  //  On 32b architectures we fall back on the generic implementation,
+  //  because LLVM doesn't know how to codegen the 128b multiply we use.
+  //
+  //  Note that arm64_32 is a 64b architecture for the purposes of this
+  //  check, because we have a 64x64 -> 128 multiply there (the actual
+  //  ISA is AArch64).
+%   end
   /// Returns a tuple containing the high and low parts of the result of
   /// multiplying this value by the given value.
   ///
   /// Use this method to calculate the full result of a product that would
   /// otherwise overflow. Unlike traditional truncating multiplication, the
-  /// `multipliedFullWidth(by:)` method returns a tuple
-  /// containing both the `high` and `low` parts of the product of this value and
-  /// `other`. The following example uses this method to multiply two `UInt8`
+  /// `multipliedFullWidth(by:)` method returns a tuple containing both the
+  /// `high` and `low` parts of the product of this value and `other`.
+  /// The following example uses this method to multiply two `UInt8`
   /// values that normally overflow when multiplied:
   ///
   ///     let x: UInt8 = 100
@@ -1557,6 +1565,8 @@ ${assignmentOperatorComment(x.operator, True)}
     let high = ${Self}(Builtin.truncOrBitCast_Int${dbits}_Int${bits}(shifted))
     return (high: high, low: low)
   }
+%   if bits == 64:
+  #endif
 %   end
 
   /// Returns a tuple containing the quotient and remainder of dividing the