diff --git a/changelog/druntime.importc-initial-msvc-intrinsics.dd b/changelog/druntime.importc-initial-msvc-intrinsics.dd
new file mode 100644
index 000000000000..25f293126e99
--- /dev/null
+++ b/changelog/druntime.importc-initial-msvc-intrinsics.dd
@@ -0,0 +1,4 @@
+ImportC now implements most of the MSVC intrinsics
+
+ImportC now implements all but three of the MSVC intrinsics [listed here](https://web.archive.org/web/20240412171516/https://learn.microsoft.com/en-ie/cpp/intrinsics/alphabetical-listing-of-intrinsic-functions?view=msvc-170), as well as a handful of undocumented intrinsics.
+The MSVC intrinsics are available when targeting the Microsoft C runtime (that is, when the `CRuntime_Microsoft` version identifier is defined).
diff --git a/compiler/src/dmd/cpreprocess.d b/compiler/src/dmd/cpreprocess.d
index ed97bfb0e62d..86a94610ebdd 100644
--- a/compiler/src/dmd/cpreprocess.d
+++ b/compiler/src/dmd/cpreprocess.d
@@ -67,9 +67,11 @@ DArray!ubyte preprocess(FileName csrcfile, ref const Loc loc, ref OutBuffer defi
     //printf("preprocess %s\n", csrcfile.toChars());
     version (runPreprocessor)
     {
+        const includePath = FileName.replaceName(toDString(importc_h), "include");
         const command = global.params.cpp ? toDString(global.params.cpp) : cppCommand();
         DArray!ubyte text;
-        int status = runPreprocessor(loc, command, csrcfile.toString(), importc_h, global.params.cppswitches, global.params.v.verbose, global.errorSink, defines, text);
+        int status = runPreprocessor(loc, command, csrcfile.toString(), importc_h, includePath, global.params.cppswitches, global.params.v.verbose, global.errorSink, defines, text);
+        FileName.free(includePath.ptr);
         if (status)
             fatal();
         return text;
diff --git a/compiler/src/dmd/link.d b/compiler/src/dmd/link.d
index 7ac403ba942c..77e917c5a06c 100644
--- a/compiler/src/dmd/link.d
+++ b/compiler/src/dmd/link.d
@@ -940,6 +940,7 @@ public int runProgram(const char[] exefile, const char*[] runargs, bool verbose,
 *      cpp = name of C preprocessor program
 *      filename = C source file name
 *      importc_h = filename of importc.h
+ *      includePath = path passed to the preprocessor as an include path
 *      cppswitches = array of switches to pass to C preprocessor
 *      verbose = print progress to eSink
 *      eSink = for verbose messages and error messages
@@ -948,7 +949,7 @@ public int runPreprocessor(ref const Loc loc, const(char)[] cpp, const(char)[] filename, const(char)* importc_h, ref Array!(const(char)*) cppswitches,
 * Returns:
 *      error status, 0 for success
 */
-public int runPreprocessor(ref const Loc loc, const(char)[] cpp, const(char)[] filename, const(char)* importc_h, ref Array!(const(char)*) cppswitches,
+public int runPreprocessor(ref const Loc loc, const(char)[] cpp, const(char)[] filename, const(char)* importc_h, const(char)[] includePath, ref Array!(const(char)*) cppswitches,
     bool verbose, ErrorSink eSink, ref OutBuffer defines, out DArray!ubyte text)
 {
     //printf("runPreprocessor() cpp: %.*s filename: %.*s\n", cast(int)cpp.length, cpp.ptr, cast(int)filename.length, filename.ptr);
@@ -994,6 +995,14 @@ public int runPreprocessor(ref const Loc loc, const(char)[] cpp, const(char)[] f
         buf.printf(" /P /Zc:preprocessor /PD /nologo /utf-8 %.*s /FI%s /Fi%.*s",
             cast(int)filename.length, filename.ptr, importc_h, cast(int)output.length, output.ptr);
+
+        /* Append the include path, if it was provided
+         */
+        if (includePath)
+        {
+            buf.write(" /I");
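+            /* includePath is the `include` directory that sits beside importc.h;
+               it holds importc_msvc_builtins.h (installed there by druntime/mak/COPY). */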
+            buf.write(includePath);
+        }
+
         /* Append preprocessor switches to command line
          */
         foreach (a; cppswitches)
@@ -1159,6 +1168,13 @@ public int runPreprocessor(ref const Loc loc, const(char)[] cpp, const(char)[] f
         // need to redefine some macros in importc.h
         argv.push("-Wno-builtin-macro-redefined");
+
+        // append the include path, if it was provided
+        if (includePath)
+        {
+            argv.push("-I");
+            argv.push(includePath.xarraydup.ptr);
+        }
+
         if (target.os == Target.OS.OSX)
         {
             argv.push("-fno-blocks"); // disable clang blocks extension
diff --git a/compiler/test/compilable/msvc_intrinsics.c b/compiler/test/compilable/msvc_intrinsics.c
new file mode 100644
index 000000000000..bdd16b0a7299
--- /dev/null
+++ b/compiler/test/compilable/msvc_intrinsics.c
@@ -0,0 +1,42 @@
+// LINK(windows):
+// REQUIRED_ARGS: -os=windows
+// PERMUTE_ARGS: -betterC -i
+// Checking that the MSVC intrinsics reimplemented for ImportC are actually available from C.
+
+#include <importc_msvc_builtins.h>
+
+#ifndef __IMPORTC_MSVC_BUILTINS__
+#error importc_msvc_builtins.h should define __IMPORTC_MSVC_BUILTINS__.
+#endif
+
+// It should be safe to include importc_msvc_builtins.h multiple times.
+#include <importc_msvc_builtins.h>
+
+// Are the MSVC intrinsics actually usable from C?
+#if defined(_M_AMD64)
+unsigned long long multiplyU128(unsigned long long a, unsigned long long b, unsigned long long* high)
+{
+    return _umul128(a, b, high);
+}
+#elif defined(_M_IX86)
+int interlockedAddLarge(long long *target, int value)
+{
+    return _InterlockedAddLargeStatistic(target, value);
+}
+#elif defined(_M_ARM64)
+unsigned long long multiplyUHigh64(unsigned long long a, unsigned long long b)
+{
+    return __umulh(a, b);
+}
+#elif defined(_M_ARM)
+void dmb(void)
+{
+    __dmb(11);
+}
+#endif
+
+// Just so the linker doesn't complain.
+int main(void)
+{
+    return 0;
+}
diff --git a/druntime/mak/COPY b/druntime/mak/COPY
index 67d36d1cab7a..dc8f41842508 100644
--- a/druntime/mak/COPY
+++ b/druntime/mak/COPY
@@ -1,7 +1,9 @@
 COPY=\
 	$(IMPDIR)\object.d \
 	$(IMPDIR)\__importc_builtins.di \
+	$(IMPDIR)\__builtins_msvc.d \
 	$(IMPDIR)\importc.h \
+	$(IMPDIR)\include\importc_msvc_builtins.h \
 	\
 	$(IMPDIR)\core\gc\config.d \
 	$(IMPDIR)\core\gc\gcinterface.d \
diff --git a/druntime/src/__builtins_msvc.d b/druntime/src/__builtins_msvc.d
new file mode 100644
index 000000000000..660a4d943c5f
--- /dev/null
+++ b/druntime/src/__builtins_msvc.d
@@ -0,0 +1,13488 @@
+/* This file contains D reimplementations of some of the intrinsics recognised
+   by the MSVC compiler, for ImportC.
+   This module is intended only for internal use, hence the leading double underscore.
+
+   Copyright: Copyright D Language Foundation 2024
+   License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
+ Authors: Harry Gillanders + Source: $(DRUNTIMESRC __builtins_msvc.d) */ + +module __builtins_msvc; + +version (CRuntime_Microsoft) +{ + version = MSVCIntrinsics; +} + +version (MSVCIntrinsics) +{ + version (X86) + { + version = X86_64_Or_X86; + } + else version (X86_64) + { + version = X86_64_Or_X86; + version = X86_64_Or_AArch64; + version = X86_64_Or_AArch64_Or_ARM; + } + else version (AArch64) + { + version = X86_64_Or_AArch64; + version = X86_64_Or_AArch64_Or_ARM; + version = AArch64_Or_ARM; + } + else version (ARM) + { + version = X86_64_Or_AArch64_Or_ARM; + version = AArch64_Or_ARM; + } + + version (D_InlineAsm_X86) + { + version = InlineAsm_X86_64_Or_X86; + } + else version (D_InlineAsm_X86_64) + { + version = InlineAsm_X86_64_Or_X86; + } + + version (LDC) + { + version = LDC_Or_GNU; + + version (X86_64_Or_X86) private enum gccBuiltins = "ldc.gccbuiltins_x86"; + else version (ARM) private enum gccBuiltins = "ldc.gccbuiltins_arm"; + else version (AArch64) private enum gccBuiltins = "ldc.gccbuiltins_aarch64"; + } + else version (GNU) + { + version = LDC_Or_GNU; + + private enum gccBuiltins = "gcc.builtins"; + } + + import core.atomic : MemoryOrder; + + static if (__traits(compiles, () {import core.simd : float4;})) + { + import core.simd : byte16, float4, long2, int4, ubyte16; + + version (X86_64_Or_X86) + { + import core.simd : double2; + } + + private enum canPassVectors = true; + } + else + { + private enum canPassVectors = false; + } + + version (LDC) + { + version (X86_64_Or_X86) + { + pragma(LDC_intrinsic, "llvm.x86.sse2.pause") + private void __builtin_ia32_pause() @safe pure nothrow @nogc; + + pragma(LDC_intrinsic, "llvm.x86.rdpmc") + private long __builtin_ia32_rdpmc(int) @safe nothrow @nogc; + + pragma(LDC_intrinsic, "llvm.x86.rdtsc") + private long __builtin_ia32_rdtsc() @safe nothrow @nogc; + + pragma(LDC_intrinsic, "llvm.x86.wbinvd") + private void __builtin_ia32_wbinvd() @safe nothrow @nogc; + } + else version (AArch64) + { + pragma(LDC_intrinsic, "llvm.aarch64.dmb") + private void __builtin_arm_dmb(int) @safe pure nothrow @nogc; + + pragma(LDC_intrinsic, "llvm.aarch64.hint") + private void llvm_arm_hint(int) @safe pure nothrow @nogc; + } + else version (ARM) + { + pragma(LDC_intrinsic, "llvm.arm.dmb") + private void __builtin_arm_dmb(int) @safe pure nothrow @nogc; + + pragma(LDC_intrinsic, "llvm.arm.hint") + private void llvm_arm_hint(int) @safe pure nothrow @nogc; + } + } + else version (GNU) + { + version (X86_64_Or_X86) + { + import gcc.builtins : + __builtin_ia32_pause, __builtin_ia32_rdpmc, __builtin_ia32_rdtsc, __builtin_ia32_wbinvd; + } + } + + version (X86_64_Or_X86) + { + version (X86_64) + { + private alias RegisterSized = ulong; + } + else version (X86) + { + private alias RegisterSized = uint; + } + } + + version (X86_64_Or_AArch64) + { + import core.internal.traits : AliasSeq; + } + + version (LDC) + { + private template llvmIRPtr(string type, string postfix = null) + { + version (LDC_LLVM_OpaquePointers) + { + enum llvmIRPtr = postfix is null ? "ptr" : "ptr " ~ postfix; + } + else + { + enum llvmIRPtr = postfix is null ? 
type ~ "*" : type ~ " " ~ postfix ~ "*"; + } + } + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ulong __umulh(ulong a, ulong b) @safe pure nothrow @nogc + { + return multiplyWithDoubleWidthProduct!(ulong, true)(a, b); + } + + extern(C) + pragma(inline, true) + long __mulh(long a, long b) @safe pure nothrow @nogc + { + return multiplyWithDoubleWidthProduct!(long, true)(a, b); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ulong _umul128(ulong Multiplier, ulong Multiplicand, scope ulong* HighProduct) @safe pure nothrow @nogc + { + return multiplyWithDoubleWidthProduct!(ulong, false)(Multiplier, Multiplicand, HighProduct); + } + + extern(C) + pragma(inline, true) + long _mul128(long Multiplier, long Multiplicand, scope long* HighProduct) @safe pure nothrow @nogc + { + return multiplyWithDoubleWidthProduct!(long, false)(Multiplier, Multiplicand, HighProduct); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + long __emul(int a, int b) @safe pure nothrow @nogc + { + return long(a) * b; + } + + extern(C) + pragma(inline, true) + ulong __emulu(uint a, uint b) @safe pure nothrow @nogc + { + return ulong(a) * b; + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + version (X86_64_Or_X86) + { + assert(__emul(7, -5) == -35); + assert(__emul(-11, 13) == -143); + assert(__emul(0x00FFFFFF, 1 << 16) == 0xFF_FFFF0000); + + assert(__emulu(7, 5) == 35); + assert(__emulu(11, 13) == 143); + assert(__emulu(0xFFFFFFFF, 1 << 8) == 0xFF_FFFFFF00); + } + + version (X86_64) + { + { + long hi = 3; + assert(_mul128(7, -5, &hi) == -35); + assert(hi == -1); + assert(_mul128(-11, 13, &hi) == -143); + assert(hi == -1); + assert(_mul128(0x00FFFFFF, 1 << 16, &hi) == 0xFF_FFFF0000); + assert(hi == 0); + assert(_mul128(0x00FFFFFF_FFFFFFFF, long(1) << 32, &hi) == 0xFFFFFFFF_00000000); + assert(hi == 0x00FFFFFF); + } + + { + ulong hi = 3; + assert(_umul128(7, 5, &hi) == 35); + assert(hi == 0); + assert(_umul128(11, 13, &hi) == 143); + assert(hi == 0); + assert(_umul128(0x00FFFFFF, 1 << 16, &hi) == 0xFF_FFFF0000); + assert(hi == 0); + assert(_umul128(0xFFFFFFFF_FFFFFFFF, long(1) << 32, &hi) == 0xFFFFFFFF_00000000); + assert(hi == 0xFFFFFFFF); + } + } + + version (X86_64_Or_AArch64) + { + assert(__mulh(7, -5) == -1); + assert(__mulh(-11, 13) == -1); + assert(__mulh(0x00FFFFFF, 1 << 16) == 0); + assert(__mulh(0x00FFFFFF_FFFFFFFF, long(1) << 32) == 0x00FFFFFF); + + assert(__umulh(7, 5) == 0); + assert(__umulh(11, 13) == 0); + assert(__umulh(0x00FFFFFF, 1 << 16) == 0); + assert(__umulh(0xFFFFFFFF_FFFFFFFF, long(1) << 32) == 0xFFFFFFFF); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + version (X86_64_Or_AArch64) + { + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + private I multiplyWithDoubleWidthProduct(I, bool onlyHighHalf)( + I low, + I high, + scope AliasSeq!(I*)[0 .. 
!onlyHighHalf] highProduct + ) @trusted + if (is(I == ulong) || is(I == long)) + { + enum bool unsigned = is(I == ulong); + + static if (unsigned) + { + alias multiplyViaSoftware = unsignedMultiplyWithDoubleWidthProduct; + } + else + { + alias multiplyViaSoftware = signedMultiplyWithDoubleWidthProduct; + } + + if (__ctfe) + { + return multiplyViaSoftware!(I, onlyHighHalf)(low, high, highProduct); + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + enum ptr = llvmIRPtr!"i64"; + enum ext = unsigned ? "zext" : "sext"; + + I a = low; + I b = high; + I lo; + I hi; + + __ir_pure!( + "%a = " ~ ext ~ " i64 %0 to i128 + %b = " ~ ext ~ " i64 %1 to i128 + + %product = mul i128 %a, %b + + " ~ (onlyHighHalf ? "" : "%lo = trunc i128 %product to i64\n") + + ~ "%hi128 = lshr i128 %product, 64 + %hi = trunc i128 %hi128 to i64 + + " ~ (onlyHighHalf ? "" : "store i64 %lo, " ~ ptr ~ " %2\n") + ~ "store i64 %hi, " ~ ptr ~ " %3", + void + )(a, b, &lo, &hi); + + static if (onlyHighHalf) + { + return hi; + } + else + { + *highProduct[0] = hi; + return lo; + } + } + else version (GNU) + { + I lo; + I hi; + + version (X86_64) + { + /* for unsigned operands; if we have PEXT, then the target has BMI2, ergo we can use MULX. */ + static if (unsigned && __traits(compiles, () {import gcc.builtins : __builtin_ia32_pext_si;})) + { + asm @trusted pure nothrow @nogc + { + "mulx %2, %0, %1" + : "=r" (lo), "=r" (hi) + : "rm" (low), "d" (high); + } + } + else + { + asm @trusted pure nothrow @nogc + { + (unsigned ? "mul" : "imul") ~ " %3" + : "=a" (lo), "=d" (hi) + : "%0" (low), "rm" (high) + : "cc"; + } + } + } + else version (AArch64) + { + static if (!onlyHighHalf) + { + asm @trusted pure nothrow @nogc + { + "mul %0, %1, %2" + : "=r" (lo) + : "%r" (low), "r" (high); + } + } + + asm @trusted pure nothrow @nogc + { + "umulh %0, %1, %2" + : "=r" (hi) + : "%r" (low), "r" (high); + } + } + + static if (onlyHighHalf) + { + return hi; + } + else + { + *highProduct[0] = hi; + return lo; + } + } + else version (D_InlineAsm_X86_64) + { + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is low; RDX is high; R8 is highProduct, if present. */ + naked; + mov RAX, RCX; + " ~ (unsigned ? "mul" : "imul") ~ " RDX; + mov " ~ (onlyHighHalf ? "RAX" : "[R8]") ~ ", RDX; + ret; + }" + ); + } + else + { + return multiplyViaSoftware!(I, onlyHighHalf)(low, high, highProduct); + } + } + } + + pragma(inline, true) + private I unsignedMultiplyWithDoubleWidthProduct(I, bool onlyHighHalf)( + I low, + I high, + scope AliasSeq!(I*)[0 .. !onlyHighHalf] highProduct + ) @safe pure nothrow @nogc + if (__traits(isIntegral, I) && __traits(isUnsigned, I)) + { + enum uint halfWidth = I.sizeof << 2; + enum I lowerHalf = (cast(I) ~I(0)) >>> halfWidth; + + auto first = low & lowerHalf; + auto second = low >>> halfWidth; + auto third = high & lowerHalf; + auto fourth = high >>> halfWidth; + + I lowest = cast(I) (cast(I) first * cast(I) third); + I lower = cast(I) (cast(I) first * cast(I) fourth); + I higher = cast(I) (cast(I) second * cast(I) third); + I highest = cast(I) (cast(I) second * cast(I) fourth); + + I middle = cast(I) ((higher & lowerHalf) + lower + (lowest >>> halfWidth)); + static if (!onlyHighHalf) I bottom = cast(I) ((middle << halfWidth) + (lowest & lowerHalf)); + I top = cast(I) (highest + (higher >>> halfWidth) + (middle >>> halfWidth)); + + static if (onlyHighHalf) + { + return top; + } + else + { + *highProduct[0] = top; + return bottom; + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + @trusted pure nothrow @nogc unittest + { + /* The mechanics used to get a double-width product from two operands are the same regardless of the width. + So, if this works for 8x8->16-bit multiplication, it'll work for 64x64->128-bit multiplication. */ + + ubyte left = 0; + ubyte right = 0; + + do + { + do + { + ushort expectedResult = left * right; + + ubyte hi = left; + ubyte lo = right; + lo = unsignedMultiplyWithDoubleWidthProduct!(ubyte, false)(lo, hi, &hi); + + assert(((ushort(hi) << 8) | lo) == expectedResult); + assert(unsignedMultiplyWithDoubleWidthProduct!(ubyte, true)(left, right) == hi); + + ++right; + } + while (right != 0); + + ++left; + } + while (left != 0); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + pragma(inline, true) + private I signedMultiplyWithDoubleWidthProduct(I, bool onlyHighHalf)( + I low, + I high, + scope AliasSeq!(I*)[0 .. !onlyHighHalf] highProduct + ) @trusted pure nothrow @nogc + if (__traits(isIntegral, I) && !__traits(isUnsigned, I)) + { + import core.bitop : bsr; + + alias UnsignedI = AliasSeq!(ubyte, ushort, uint, ulong)[I.sizeof.bsr]; + + UnsignedI lo = cast(UnsignedI) low; + UnsignedI hi = cast(UnsignedI) high; + + static if (onlyHighHalf) + { + hi = unsignedMultiplyWithDoubleWidthProduct!(UnsignedI, true)(lo, hi); + } + else + { + lo = unsignedMultiplyWithDoubleWidthProduct!(UnsignedI, false)(lo, hi, &hi); + } + + hi -= high * (low < 0); + hi -= low * (high < 0); + + static if (onlyHighHalf) + { + return hi; + } + else + { + *highProduct[0] = hi; + return lo; + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + /* The mechanics used to get a double-width product from two operands are the same regardless of the width. + So, if this works for 8x8->16-bit multiplication, it'll work for 64x64->128-bit multiplication. */ + + byte left = byte.min; + byte right = byte.min; + + do + { + do + { + short expectedResult = left * right; + + byte hi = left; + byte lo = right; + lo = signedMultiplyWithDoubleWidthProduct!(byte, false)(lo, hi, &hi); + + assert(cast(short) (((short(hi) << 8) & 0xFF00) | (lo & 0x00FF)) == expectedResult); + assert(signedMultiplyWithDoubleWidthProduct!(byte, true)(left, right) == hi); + + ++right; + } + while (right != byte.min); + + ++left; + } + while (left != byte.min); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + long _div128(long highDividend, long lowDividend, long divisor, scope long* remainder) @safe pure nothrow @nogc + { + if (__ctfe) + { + /* This is an amalgamation of core.int128.divmod and core.int128.neg. 
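The dividend is negated up front so the division itself can run unsigned through _udiv128; the quotient is then negated when exactly one operand was negative, and the remainder always takes the sign of the dividend. 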
*/ + + if (highDividend < 0) + { + if (lowDividend == 0) + { + highDividend = -highDividend; + } + else + { + lowDividend = -lowDividend; + highDividend = ~highDividend; + } + + ulong quotient; + + if (divisor < 0) + { + quotient = _udiv128( + cast(ulong) highDividend, + cast(ulong) lowDividend, + cast(ulong) -divisor, + cast(ulong*) remainder + ); + } + else + { + quotient = -_udiv128( + cast(ulong) highDividend, + cast(ulong) lowDividend, + cast(ulong) divisor, + cast(ulong*) remainder + ); + } + + *remainder = -*remainder; + return quotient; + } + else if (divisor < 0) + { + return -_udiv128( + cast(ulong) highDividend, + cast(ulong) lowDividend, + cast(ulong) -divisor, + cast(ulong*) remainder + ); + } + else + { + return _udiv128( + cast(ulong) highDividend, + cast(ulong) lowDividend, + cast(ulong) divisor, + cast(ulong*) remainder + ); + } + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + return __ir_pure!( + `%result = call {i64, i64} asm + "idiv $4", + "={rax},={rdx},0,1,r,~{flags}" + (i64 %1, i64 %0, i64 %2) + + %quotient = extractvalue {i64, i64} %result, 0 + %remainder = extractvalue {i64, i64} %result, 1 + + store i64 %remainder, ` ~ llvmIRPtr!"i64" ~ ` %3 + ret i64 %quotient`, + long + )(highDividend, lowDividend, divisor, remainder); + } + else version (GNU) + { + long quotient; + long remainer; + + asm @trusted pure nothrow @nogc + { + "idiv %4" + : "=a" (quotient), "=d" (remainer) + : "0" (lowDividend), "1" (highDividend), "rm" (divisor) + : "cc"; + } + + *remainder = remainer; + return quotient; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is highDividend; RDX is lowDividend; R8 is divisor. R9 is remainder. */ + naked; + mov RAX, RDX; + mov RDX, RCX; + idiv R8; + mov [R9], RDX; + ret; + } + } + } + } + + extern(C) + pragma(inline, true) + ulong _udiv128(ulong highDividend, ulong lowDividend, ulong divisor, scope ulong* remainder) + @safe pure nothrow @nogc + { + if (__ctfe) + { + // This code was copied and adapted from core.int128.udivmod.udivmod128_64. + + import core.bitop : bsr; + + alias U = ulong; + alias I = long; + enum uint Ubits = 64; + // We work in base 2^^32 + enum base = 1UL << 32; + enum divmask = (1UL << (Ubits / 2)) - 1; + enum divshift = Ubits / 2; + + // Check for overflow and divide by 0 + if (highDividend >= divisor) + { + // The div instruction will raise a #DE exception on overflow or division-by-zero, + // so during CTFE we'll just assert false. + version (D_BetterC) + { + assert(false, "Division by zero, or an overflow of the 64-bit quotient occurred in _udiv128."); + } + else + { + import core.internal.string : unsignedToTempString; + assert( + false, + "Division by zero, or an overflow of the 64-bit quotient occurred in _udiv128." + ~ " highDividend: 0x" ~ unsignedToTempString!16(highDividend) + ~ "; lowDividend: 0x" ~ unsignedToTempString!16(lowDividend) + ~ "; divisor: 0x" ~ unsignedToTempString!16(divisor) + ); + } + } + + // Computes [num1 num0] / den + static uint udiv96_64(U num1, uint num0, U den) + { + // Extract both digits of the denominator + const den1 = cast(uint)(den >> divshift); + const den0 = cast(uint)(den & divmask); + // Estimate ret as num1 / den1, and then correct it + U ret = num1 / den1; + const t2 = (num1 % den1) * base + num0; + const t1 = ret * den0; + if (t1 > t2) + ret -= (t1 - t2 > den) ? 2 : 1; + return cast(uint)ret; + } + + // Determine the normalization factor. We multiply divisor by this, so that its leading + // digit is at least half base. 
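(This is the normalization step of Knuth's Algorithm D, which guarantees that each estimated quotient digit is at most 2 too large.) 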
In binary this means just shifting left by the number + // of leading zeros, so that there's a 1 in the MSB. + // We also shift number by the same amount. This cannot overflow because highDividend < divisor. + const shift = (Ubits - 1) - bsr(divisor); + divisor <<= shift; + U num2 = highDividend; + num2 <<= shift; + num2 |= (lowDividend >> (-shift & 63)) & (-cast(I)shift >> 63); + lowDividend <<= shift; + + // Extract the low digits of the numerator (after normalizing) + const num1 = cast(uint)(lowDividend >> divshift); + const num0 = cast(uint)(lowDividend & divmask); + + // Compute q1 = [num2 num1] / divisor + const q1 = udiv96_64(num2, num1, divisor); + // Compute the true (partial) remainder + const rem = num2 * base + num1 - q1 * divisor; + // Compute q0 = [rem num0] / divisor + const q0 = udiv96_64(rem, num0, divisor); + + *remainder = (rem * base + num0 - q0 * divisor) >> shift; + return (cast(U)q1 << divshift) | q0; + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + return __ir_pure!( + `%result = call {i64, i64} asm + "div $4", + "={rax},={rdx},0,1,r,~{flags}" + (i64 %1, i64 %0, i64 %2) + + %quotient = extractvalue {i64, i64} %result, 0 + %remainder = extractvalue {i64, i64} %result, 1 + + store i64 %remainder, ` ~ llvmIRPtr!"i64" ~ ` %3 + ret i64 %quotient`, + ulong + )(highDividend, lowDividend, divisor, remainder); + } + else version (GNU) + { + ulong quotient; + ulong remainer; + + asm @trusted pure nothrow @nogc + { + "div %4" + : "=a" (quotient), "=d" (remainer) + : "0" (lowDividend), "1" (highDividend), "rm" (divisor) + : "cc"; + } + + *remainder = remainer; + return quotient; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is highDividend; RDX is lowDividend; R8 is divisor. R9 is remainder. */ + naked; + mov RAX, RDX; + mov RDX, RCX; + div R8; + mov [R9], RDX; + ret; + } + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _div64(long dividend, int divisor, scope int* remainder) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (((dividend < 0 ? -dividend : dividend) >>> 32) >= (divisor < 0 ? -divisor : divisor)) + { + /* The div instruction will raise a #DE exception on overflow or division-by-zero, + so during CTFE we'll just assert false. */ + version (D_BetterC) + { + assert(false, "Division by zero, or an overflow of the 32-bit quotient occurred in _div64."); + } + else + { + import core.internal.string : signedToTempString; + assert( + false, + "Division by zero, or an overflow of the 32-bit quotient occurred in _div64." 
+ ~ " dividend: " ~ signedToTempString(dividend) + ~ "; divisor: " ~ signedToTempString(divisor) + ); + } + } + + *remainder = cast(int) (dividend % divisor); + return cast(int) (dividend / divisor); + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + return __ir_pure!( + `%result = call {i32, i32} asm + "idiv $4", + "={eax},={edx},0,1,r,~{flags}" + (i32 %1, i32 %0, i32 %2) + + %quotient = extractvalue {i32, i32} %result, 0 + %remainder = extractvalue {i32, i32} %result, 1 + + store i32 %remainder, ` ~ llvmIRPtr!"i32" ~ ` %3 + ret i32 + %quotient`, + int + )(cast(int) (dividend >>> 32), cast(int) (dividend & 0xFFFFFFFF), divisor, remainder); + } + else version (GNU) + { + int quotient; + int remainer; + + asm @trusted pure nothrow @nogc + { + "idiv %4" + : "=a" (quotient), "=d" (remainer) + : "0" (cast(int) (dividend & 0xFFFFFFFF)), "1" (cast(int) (dividend >>> 32)), "rm" (divisor) + : "cc"; + } + + *remainder = remainer; + return quotient; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is dividend; EDX is divisor; R8 is remainder. */ + naked; + mov R9D, EDX; + mov RDX, RCX; + shr RDX, 32; + mov EAX, ECX; + idiv R9D; + mov [R8], EDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + mov EAX, [ESP + 4]; /* Low half of dividend. */ + mov EDX, [ESP + 8]; /* High half of dividend. */ + idiv dword ptr [ESP + 12]; /* [ESP + 12] is divisor. */ + mov ECX, [ESP + 16]; /* remainder. */ + mov [ECX], EDX; + ret; + } + } + } + } + + extern(C) + pragma(inline, true) + uint _udiv64(ulong dividend, uint divisor, scope uint* remainder) @safe pure nothrow @nogc + { + if (__ctfe) + { + if ((dividend >>> 32) >= divisor) + { + /* The div instruction will raise a #DE exception on overflow or division-by-zero, + so during CTFE we'll just assert false. */ + version (D_BetterC) + { + assert(false, "Division by zero, or an overflow of the 32-bit quotient occurred in _udiv64."); + } + else + { + import core.internal.string : unsignedToTempString; + assert( + false, + "Division by zero, or an overflow of the 32-bit quotient occurred in _udiv64." + ~ " dividend: " ~ unsignedToTempString(dividend) + ~ "; divisor: " ~ unsignedToTempString(divisor) + ); + } + } + + *remainder = cast(uint) (dividend % divisor); + return cast(uint) (dividend / divisor); + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + return __ir_pure!( + `%result = call {i32, i32} asm + "div $4", + "={eax},={edx},0,1,r,~{flags}" + (i32 %1, i32 %0, i32 %2) + + %quotient = extractvalue {i32, i32} %result, 0 + %remainder = extractvalue {i32, i32} %result, 1 + + store i32 %remainder, ` ~ llvmIRPtr!"i32" ~ ` %3 + ret i32 + %quotient`, + uint + )(uint(dividend >>> 32), uint(dividend & 0xFFFFFFFF), divisor, remainder); + } + else version (GNU) + { + uint quotient; + uint remainer; + + asm @trusted pure nothrow @nogc + { + "div %4" + : "=a" (quotient), "=d" (remainer) + : "0" (uint(dividend & 0xFFFFFFFF)), "1" (uint(dividend >>> 32)), "rm" (divisor) + : "cc"; + } + + *remainder = remainer; + return quotient; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is dividend; EDX is divisor; R8 is remainder. */ + naked; + mov R9D, EDX; + mov RDX, RCX; + shr RDX, 32; + mov EAX, ECX; + div R9D; + mov [R8], EDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + mov EAX, [ESP + 4]; /* Low half of dividend. 
*/ + mov EDX, [ESP + 8]; /* High half of dividend. */ + div dword ptr [ESP + 12]; /* [ESP + 12] is divisor. */ + mov ECX, [ESP + 16]; /* remainder. */ + mov [ECX], EDX; + ret; + } + } + } + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + version (X86_64) + { + { + ulong remainder; + assert(_udiv128(0x0000CAFE, 0x00F00D00, 1 << 16, &remainder) == 0xCAFE0000_000000F0); + assert(remainder == (0x00F00D00 & ((1 << 16) - 1))); + assert(_udiv128(0x0000CAFE, 0x00F00D00 + (1 << 16), 1 << 16, &remainder) == 0xCAFE0000_000000F1); + assert(remainder == (0x00F00D00 & ((1 << 16) - 1))); + } + + { + long remainder; + assert(_div128(0, 9, 4, &remainder) == 2); + assert(remainder == 1); + assert(_div128(0, 9, -4, &remainder) == -2); + assert(remainder == 1); + assert(_div128(-1, -9, 4, &remainder) == -2); + assert(remainder == -1); + assert(_div128(-1, -9, -4, &remainder) == 2); + assert(remainder == -1); + assert(_div128(0x00004AFE, 0x00F10D00, 1 << 16, &remainder) == 0x4AFE0000_000000F1); + assert(remainder == (0x00F10D00 & ((1 << 16) - 1))); + } + } + + version (X86_64_Or_X86) + { + { + uint remainder; + assert(_udiv64(9, 4, &remainder) == 2); + assert(remainder == 1); + assert(_udiv64(0x0000CAFE_00001234, 1 << 16, &remainder) == 0x00000000_CAFE0000); + assert(remainder == (0x0000CAFE_00001234 & ((1 << 16) - 1))); + } + + { + int remainder; + assert(_div64(9, 4, &remainder) == 2); + assert(remainder == 1); + assert(_div64(9, -4, &remainder) == -2); + assert(remainder == 1); + assert(_div64(-9, 4, &remainder) == -2); + assert(remainder == -1); + assert(_div64(-9, -4, &remainder) == 2); + assert(remainder == -1); + assert(_div64(0x00004AFE_00011234, 1 << 16, &remainder) == 0x00000000_4AFE0001); + assert(remainder == (0x00004AFE_00011234 & ((1 << 16) - 1))); + } + } + + return true; + } + + assert(test()); + static assert(test()); + + enum bool errorOccursDuringCTFE(alias symbol, T, T divisor) = !__traits( + compiles, + () + { + T remainder; + enum result = symbol(0x0000CAFE, 0x00F00D00, divisor, &remainder); + } + ); + + version (X86_64) + { + /* An error should occur when attempting to divide by zero. */ + static assert(errorOccursDuringCTFE!(_udiv128, ulong, 0)); + static assert(errorOccursDuringCTFE!(_div128, long, 0)); + /* And, when when the quotient overflows 64-bits. */ + static assert(errorOccursDuringCTFE!(_udiv128, ulong, 2)); + static assert(errorOccursDuringCTFE!(_div128, long, 2)); + } + + version (X86_64_Or_X86) + { + /* An error should occur when attempting to divide by zero. */ + static assert(errorOccursDuringCTFE!(_udiv64, uint, 0)); + static assert(errorOccursDuringCTFE!(_div64, int, 0)); + /* And, when when the quotient overflows 64-bits. */ + static assert(errorOccursDuringCTFE!(_udiv64, uint, 2)); + static assert(errorOccursDuringCTFE!(_div64, int, 2)); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void _mm_pause() @safe pure nothrow @nogc + { + if (__ctfe) + {} + else + { + /* core.atomic.pause won't work for BetterC. 
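Instead, the pause hint is issued directly: through the GCC-style builtin on LDC and GDC, and through inline assembly on DMD. 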
*/ + version (LDC_Or_GNU) + { + __builtin_ia32_pause(); + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + pause; + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + _mm_pause(); + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (AArch64_Or_ARM) + { + version (GNU) + { + extern(C) + pragma(inline, true) + void __builtin_arm_dmb(uint Type) @safe pure nothrow @nogc + { + armBarrier!"dmb"(Type); + } + + extern(C) + pragma(inline, true) + void __builtin_arm_dsb(uint Type) @safe pure nothrow @nogc + { + armBarrier!"dsb"(Type); + } + + extern(C) + pragma(inline, true) + void __builtin_arm_isb(uint Type) @safe pure nothrow @nogc + { + armBarrier!"isb"(Type); + } + + @safe pure nothrow @nogc unittest + { + static bool test(alias barrier)() + { + barrier(0xF); + barrier(0xE); + barrier(0xB); + barrier(0xA); + barrier(0x7); + barrier(0x6); + barrier(0x3); + barrier(0x2); + + try + { + barrier(0); + } + catch (AssertError) + { + return true; + } + + assert(false); + } + + assert(test!__builtin_arm_dmb()); + static assert(test!__builtin_arm_dmb()); + assert(test!__builtin_arm_dsb()); + static assert(test!__builtin_arm_dsb()); + assert(test!__builtin_arm_isb()); + static assert(test!__builtin_arm_isb()); + } + + extern(C) + pragma(inline, true) + private void armBarrier(string barrier)(uint type) @safe pure nothrow @nogc + { + enum assertMessage = "Invalid Type supplied to __" ~ barrier ~ "."; + + switch (type) + { + case 0xF: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " sy" : : : "memory";} break; + } + break; + case 0xE: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " st" : : : "memory";} break; + } + break; + case 0xB: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " ish" : : : "memory";} break; + } + break; + case 0xA: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " ishst" : : : "memory";} break; + } + break; + case 0x7: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " nsh" : : : "memory";} break; + } + break; + case 0x6: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " nshst" : : : "memory";} break; + } + break; + case 0x3: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " osh" : : : "memory";} break; + } + break; + case 0x2: + if (__ctfe) + {} + else + { + asm @trusted pure nothrow @nogc {"" ~ barrier ~ " oshst" : : : "memory";} break; + } + break; + default: + assert(false, assertMessage); + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void __cpuid(scope int[4]* cpuInfo, int function_id) @safe pure nothrow @nogc + { + cpuID(cpuInfo, function_id); + } + + extern(C) + pragma(inline, true) + void __cpuidex(scope int[4]* cpuInfo, int function_id, int subfunction_id) @safe pure nothrow @nogc + { + cpuID(cpuInfo, function_id, subfunction_id); + } + + extern(C) + pragma(inline, true) + private void cpuID(Args...)(scope int[4]* cpuInfo, int function_id, Args args) @safe pure nothrow @nogc + if (Args.length == 0 || (Args.length == 1 && is(Args[0] == int))) + { + version (LDC_Or_GNU) + { + asm @trusted pure nothrow @nogc + { + "cpuid" + : "=a" ((*cpuInfo)[0]), "=b" ((*cpuInfo)[1]), "=c" ((*cpuInfo)[2]), "=d" ((*cpuInfo)[3]) + : "0" (function_id), "2" (mixin(Args.length == 0 ? 
q{0} : q{args[0]})); + } + } + else version (InlineAsm_X86_64_Or_X86) + { + version (D_InlineAsm_X86_64) + { + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is cpuInfo; EDX is function_id; + R8D is subfunction_id (args[0]), if it's present. */ + naked; + mov R9, RCX; /* Save the cpuInfo pointer before cpuid clobbers RCX. */ + mov EAX, EDX; + " ~ (Args.length == 0 ? "xor ECX, ECX" : "mov ECX, R8D") ~ "; + mov R10, RBX; /* RBX is non-volatile so we save it before cpuid clobbers it. */ + cpuid; + mov [R9], EAX; + mov [R9 + 4], EBX; + mov [R9 + 8], ECX; + mov [R9 + 12], EDX; + mov RBX, R10; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + push EBX; /* EBX is non-volatile so we save it before cpuid clobbers it. */ + push ESI; /* ESI is non-volatile so we save it before we clobber it. */ + mov EAX, [ESP + 16]; /* function_id. */ + mov ESI, [ESP + 12]; /* cpuInfo. */ + " ~ (Args.length == 0 ? "xor ECX, ECX" : "mov ECX, [ESP + 20] /* subfunction_id */") ~ "; + cpuid; + mov [ESI], EAX; + mov [ESI + 4], EBX; + mov [ESI + 8], ECX; + mov [ESI + 12], EDX; + pop ESI; + pop EBX; + ret; + }" + ); + } + } + else + { + static assert(false); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + import core.cpuid : vendor; + + scope int[4] values = 0x18181818; + + char[12] manufacturer() + { + typeof(return) characters; + characters[0 .. 4] = *cast(const(char)[4]*) &values[1]; + characters[4 .. 8] = *cast(const(char)[4]*) &values[3]; + characters[8 .. 12] = *cast(const(char)[4]*) &values[2]; + return characters; + } + + __cpuid(&values, 0); + assert(manufacturer == vendor); + + values = 0x18181818; + __cpuidex(&values, 0, 0); + assert(manufacturer == vendor); + + if (values[0] < 7) + { + return; + } + + __cpuidex(&values, 7, 0); + + if (values[0] < 1) + { + return; + } + + /* Is the subfunction_id being used? Or, is __cpuidex mistakenly ignoring it? Let's test. 
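Whenever leaf 7 reports at least one subleaf, subleaf 1 should return a feature bitmap distinct from subleaf 0's output, so the two results are expected to differ. 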
*/ + scope oldValues = values; + __cpuidex(&values, 7, 1); + assert(values != oldValues); + } + } + + version (X86_64_Or_X86) + { + private enum float twoExp31Float = 2147483648.0f; + private enum float twoExp32Float = 4294967296.0f; + private enum float twoExp63Float = 9223372036854775808.0f; + private enum float twoExp64Float = 18446744073709551616.0f; + private enum double twoExp31Double = 2147483648.0; + private enum double twoExp32Double = 4294967296.0; + private enum double twoExp63Double = 9223372036854775808.0; + private enum double twoExp64Double = 18446744073709551616.0; + private enum float justUnderTwoExp63Float = 9223371487098961920.0f; + private enum double justUnderTwoExp63Double = 9223371487098961920.0f; + + version (LDC_Or_GNU) + {} + else version (InlineAsm_X86_64_Or_X86) + { + private static immutable float twoExp31FloatInstance = twoExp31Float; + private static immutable float twoExp63FloatInstance = twoExp63Float; + private static immutable double twoExp31DoubleInstance = twoExp31Double; + private static immutable double twoExp63DoubleInstance = twoExp63Double; + } + + extern(C) + pragma(inline, true) + int _cvt_ftoi_fast(float value) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (value < twoExp31Float && value >= -twoExp31Float) + { + return cast(int) value; + } + + return 0x80000000; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttss2si;}); + + return __builtin_ia32_cvttss2si(value); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + naked; + cvttss2si EAX, XMM0; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + cvttss2si EAX, [ESP + 4]; + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoi_fast(0.0f) == 0); + assert(_cvt_ftoi_fast(-0.0f) == 0); + assert(_cvt_ftoi_fast(float.nan) == 0x80000000); + assert(_cvt_ftoi_fast(-float.nan) == 0x80000000); + assert(_cvt_ftoi_fast(float.infinity) == 0x80000000); + assert(_cvt_ftoi_fast(-float.infinity) == 0x80000000); + assert(_cvt_ftoi_fast(1.0f) == 1); + assert(_cvt_ftoi_fast(-1.0f) == -1); + assert(_cvt_ftoi_fast(2.5f) == 2); + assert(_cvt_ftoi_fast(-2.5f) == -2); + assert(_cvt_ftoi_fast(3.5f) == 3); + assert(_cvt_ftoi_fast(-3.5f) == -3); + assert(_cvt_ftoi_fast(3.49f) == 3); + assert(_cvt_ftoi_fast(-3.49f) == -3); + assert(_cvt_ftoi_fast(twoExp31Float) == 0x80000000); + assert(_cvt_ftoi_fast(-twoExp31Float) == int.min); + assert(_cvt_ftoi_fast(twoExp63Float) == 0x80000000); + assert(_cvt_ftoi_fast(-twoExp63Float) == int.min); + assert(_cvt_ftoi_fast(justUnderTwoExp63Float) == int.min); + assert(_cvt_ftoi_fast(33554432.0f) == 33554432); + assert(_cvt_ftoi_fast(-33554432.0f) == -33554432); + assert(_cvt_ftoi_fast(33554436.0f) == 33554436); + assert(_cvt_ftoi_fast(-33554436.0f) == -33554436); + assert(_cvt_ftoi_fast(70369281048576.0f) == 0x80000000); + assert(_cvt_ftoi_fast(-70369281048576.0f) == 0x80000000); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + extern(C) + pragma(inline, true) + long _cvt_ftoll_fast(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (__ctfe) + { + if (value < twoExp63Float && value >= -twoExp63Float) + { + return cast(long) value; + } + + return 0x80000000_00000000; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttss2si64;}); + + return __builtin_ia32_cvttss2si64(value); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + enum ubyte RAX_XMM0 = 0b11_000_000; + + asm @trusted pure nothrow @nogc + { + naked; + /* DMD refuses to encode `cvttss2si RAX, XMM0`, so we'll encode it by hand. */ + db 0xF3, REX_W, 0x0F, 0x2C, RAX_XMM0; /* cvttss2si RAX, XMM0 */ + ret; + } + } + } + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float && value >= -twoExp31Float) + { + return _cvt_ftoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent less-than 63, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + uint sign = asInt >> 31; + assert(sign == 0 || sign == -1); + + bool isNaN = (asInt & 0b0_11111111_11111111111111111111111) > 0b0_11111111_00000000000000000000000; + + if (isNaN) + { + /* The MSVC intrinsic converts signalling NaNs to quiet NaNs, and this is observable + in the returned value, so we do the same. */ + asInt |= (1 << 22); + } + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. */ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent <= -127 || exponent >= 32); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint unadjustedSignificand = (asInt << 8) | (1 << 31); + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + uint significand = (unadjustedSignificand ^ sign) - sign; + assert(sign == 0 ? 
significand == unadjustedSignificand : significand == -unadjustedSignificand); + + uint highHalf = funnelShiftLeft(significand, sign, exponent & 31); + + return (ulong(highHalf) << 32) | ulong(significand << (exponent & 31)); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoll_fast(0.0f) == 0); + assert(_cvt_ftoll_fast(-0.0f) == 0); + assert(_cvt_ftoll_fast(1.0f) == 1); + assert(_cvt_ftoll_fast(-1.0f) == -1); + assert(_cvt_ftoll_fast(2.5f) == 2); + assert(_cvt_ftoll_fast(-2.5f) == -2); + assert(_cvt_ftoll_fast(3.5f) == 3); + assert(_cvt_ftoll_fast(-3.5f) == -3); + assert(_cvt_ftoll_fast(3.49f) == 3); + assert(_cvt_ftoll_fast(-3.49f) == -3); + assert(_cvt_ftoll_fast(twoExp31Float) == 2147483648); + assert(_cvt_ftoll_fast(-twoExp31Float) == -2147483648); + assert(_cvt_ftoll_fast(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoll_fast(33554432.0f) == 33554432); + assert(_cvt_ftoll_fast(-33554432.0f) == -33554432); + assert(_cvt_ftoll_fast(33554436.0f) == 33554436); + assert(_cvt_ftoll_fast(-33554436.0f) == -33554436); + assert(_cvt_ftoll_fast(70369281048576.0f) == 70369281048576); + assert(_cvt_ftoll_fast(-70369281048576.0f) == -70369281048576); + + version (X86_64) + { + assert(_cvt_ftoll_fast(float.nan) == -9223372036854775808); + assert(_cvt_ftoll_fast(-float.nan) == -9223372036854775808); + assert(_cvt_ftoll_fast(float.infinity) == -9223372036854775808); + assert(_cvt_ftoll_fast(-float.infinity) == -9223372036854775808); + assert(_cvt_ftoll_fast(twoExp63Float) == -9223372036854775808); + assert(_cvt_ftoll_fast(-twoExp63Float) == -9223372036854775808); + } + else version (X86) + { + assert(_cvt_ftoll_fast(float.nan) == 6442450944); + assert(_cvt_ftoll_fast(-float.nan) == -6442450944); + assert(_cvt_ftoll_fast(float.infinity) == 4294967296); + assert(_cvt_ftoll_fast(-float.infinity) == -4294967296); + assert(_cvt_ftoll_fast(twoExp63Float) == 2147483648); + assert(_cvt_ftoll_fast(-twoExp63Float) == -2147483648); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_ftoui_fast(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + return cast(uint) _cvt_ftoll_fast(value); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float || value != value) + { + return cast(uint) _cvt_ftoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent of 31, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, and we only care about being correct for an exponent of 31, + which means that we can just shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. 
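For example, 2147483648.0f has the bit pattern 0x4F000000: shifting it left by 8 leaves zero, and OR-ing in 1 << 31 restores the implicit leading bit, giving exactly 0x80000000. 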
*/ + return (*(cast(const(uint)*) &value) << 8) | (1 << 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoui_fast(0.0f) == 0); + assert(_cvt_ftoui_fast(-0.0f) == 0); + assert(_cvt_ftoui_fast(1.0f) == 1); + assert(_cvt_ftoui_fast(-1.0f) == 4294967295); + assert(_cvt_ftoui_fast(2.5f) == 2); + assert(_cvt_ftoui_fast(-2.5f) == 4294967294); + assert(_cvt_ftoui_fast(3.5f) == 3); + assert(_cvt_ftoui_fast(-3.5f) == 4294967293); + assert(_cvt_ftoui_fast(3.49f) == 3); + assert(_cvt_ftoui_fast(-3.49f) == 4294967293); + assert(_cvt_ftoui_fast(twoExp31Float) == 2147483648); + assert(_cvt_ftoui_fast(-twoExp31Float) == 2147483648); + assert(_cvt_ftoui_fast(33554432.0f) == 33554432); + assert(_cvt_ftoui_fast(-33554432.0f) == 4261412864); + assert(_cvt_ftoui_fast(33554436.0f) == 33554436); + assert(_cvt_ftoui_fast(-33554436.0f) == 4261412860); + + version (X86_64) + { + assert(_cvt_ftoui_fast(twoExp63Float) == 0); + assert(_cvt_ftoui_fast(-twoExp63Float) == 0); + assert(_cvt_ftoui_fast(justUnderTwoExp63Float) == 0); + assert(_cvt_ftoui_fast(float.nan) == 0); + assert(_cvt_ftoui_fast(-float.nan) == 0); + assert(_cvt_ftoui_fast(float.infinity) == 0); + assert(_cvt_ftoui_fast(-float.infinity) == 0); + assert(_cvt_ftoui_fast(70369281048576.0f) == 536870912); + assert(_cvt_ftoui_fast(-70369281048576.0f) == 3758096384); + } + else version (X86) + { + assert(_cvt_ftoui_fast(twoExp63Float) == 2147483648); + assert(_cvt_ftoui_fast(-twoExp63Float) == 2147483648); + assert(_cvt_ftoui_fast(justUnderTwoExp63Float) == 4294967040); + assert(_cvt_ftoui_fast(float.nan) == 2147483648); + assert(_cvt_ftoui_fast(-float.nan) == 2147483648); + assert(_cvt_ftoui_fast(float.infinity) == 2147483648); + assert(_cvt_ftoui_fast(-float.infinity) == 2147483648); + assert(_cvt_ftoui_fast(70369281048576.0f) == 2147500032); + assert(_cvt_ftoui_fast(-70369281048576.0f) == 2147483648); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_ftoull_fast(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Float || value != value) + { + return cast(ulong) _cvt_ftoll_fast(value); + } + + /* At this point, the exponent is at-least 63, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent of 63, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 63, and we only care about being correct for an exponent of 63, + which means that we can just shift left unconditionally by 40, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. */ + return (ulong(*(cast(const(uint)*) &value)) << 40) | (ulong(1) << 63); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float || value != value) + { + return cast(ulong) cast(uint) _cvt_ftoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. 
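(In fact, NaNs and all negative values were already routed through _cvt_ftoi_fast above, so only large positive values, including +infinity, reach this point.) 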
+ We care about being correct for values with only an exponent less-than 64, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. */ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent <= -127 || exponent >= 32); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint significand = (asInt << 8) | (1 << 31); + + return ulong(significand) << (exponent == 64 ? 32 : (exponent & 31)); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoull_fast(0.0f) == 0); + assert(_cvt_ftoull_fast(-0.0f) == 0); + assert(_cvt_ftoull_fast(1.0f) == 1); + assert(_cvt_ftoull_fast(2.5f) == 2); + assert(_cvt_ftoull_fast(3.5f) == 3); + assert(_cvt_ftoull_fast(3.49f) == 3); + assert(_cvt_ftoull_fast(twoExp31Float) == 2147483648); + assert(_cvt_ftoull_fast(twoExp63Float) == 9223372036854775808); + assert(_cvt_ftoull_fast(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoull_fast(33554432.0f) == 33554432); + assert(_cvt_ftoull_fast(33554436.0f) == 33554436); + assert(_cvt_ftoull_fast(70369281048576.0f) == 70369281048576); + + version (X86_64) + { + assert(_cvt_ftoull_fast(-1.0f) == 18446744073709551615); + assert(_cvt_ftoull_fast(-2.5f) == 18446744073709551614); + assert(_cvt_ftoull_fast(-3.5f) == 18446744073709551613); + assert(_cvt_ftoull_fast(-3.49f) == 18446744073709551613); + assert(_cvt_ftoull_fast(-twoExp31Float) == 18446744071562067968); + assert(_cvt_ftoull_fast(-twoExp63Float) == 9223372036854775808); + assert(_cvt_ftoull_fast(float.nan) == 9223372036854775808); + assert(_cvt_ftoull_fast(-float.nan) == 9223372036854775808); + assert(_cvt_ftoull_fast(float.infinity) == 9223372036854775808); + assert(_cvt_ftoull_fast(-float.infinity) == 9223372036854775808); + assert(_cvt_ftoull_fast(-33554432.0f) == 18446744073675997184); + assert(_cvt_ftoull_fast(-33554436.0f) == 18446744073675997180); + assert(_cvt_ftoull_fast(-70369281048576.0f) == 18446673704428503040); + } + else version (X86) + { + assert(_cvt_ftoull_fast(-1.0f) == 4294967295); + assert(_cvt_ftoull_fast(-2.5f) == 4294967294); + assert(_cvt_ftoull_fast(-3.5f) == 4294967293); + assert(_cvt_ftoull_fast(-3.49f) == 4294967293); + assert(_cvt_ftoull_fast(-twoExp31Float) == 2147483648); + assert(_cvt_ftoull_fast(-twoExp63Float) == 2147483648); + assert(_cvt_ftoull_fast(float.nan) == 2147483648); + assert(_cvt_ftoull_fast(-float.nan) == 2147483648); + assert(_cvt_ftoull_fast(float.infinity) == 4294967296); + assert(_cvt_ftoull_fast(-float.infinity) == 2147483648); + assert(_cvt_ftoull_fast(-33554432.0f) == 4261412864); + assert(_cvt_ftoull_fast(-33554436.0f) == 4261412860); + 
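/* These negative inputs take the 32-bit _cvt_ftoi_fast path: in-range magnitudes wrap modulo 2^32, while magnitudes too large for an int come back as 0x80000000, x86's integer-indefinite value. */ + 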
assert(_cvt_ftoull_fast(-70369281048576.0f) == 2147483648); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _cvt_dtoi_fast(double value) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (value < twoExp31Double && value >= -twoExp31Double) + { + return cast(int) value; + } + + return 0x80000000; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttsd2si;}); + + return __builtin_ia32_cvttsd2si(value); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + naked; + cvttsd2si EAX, XMM0; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + cvttsd2si EAX, [ESP + 4]; + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoi_fast(0.0) == 0); + assert(_cvt_dtoi_fast(-0.0) == 0); + assert(_cvt_dtoi_fast(float.nan) == -2147483648); + assert(_cvt_dtoi_fast(-float.nan) == -2147483648); + assert(_cvt_dtoi_fast(float.infinity) == -2147483648); + assert(_cvt_dtoi_fast(-float.infinity) == -2147483648); + assert(_cvt_dtoi_fast(1.0) == 1); + assert(_cvt_dtoi_fast(-1.0) == -1); + assert(_cvt_dtoi_fast(2.5) == 2); + assert(_cvt_dtoi_fast(-2.5) == -2); + assert(_cvt_dtoi_fast(3.5) == 3); + assert(_cvt_dtoi_fast(-3.5) == -3); + assert(_cvt_dtoi_fast(3.49) == 3); + assert(_cvt_dtoi_fast(-3.49) == -3); + assert(_cvt_dtoi_fast(twoExp31Float) == -2147483648); + assert(_cvt_dtoi_fast(-twoExp31Float) == -2147483648); + assert(_cvt_dtoi_fast(twoExp63Float) == -2147483648); + assert(_cvt_dtoi_fast(-twoExp63Float) == -2147483648); + assert(_cvt_dtoi_fast(justUnderTwoExp63Float) == -2147483648); + assert(_cvt_dtoi_fast(33554432.0) == 33554432); + assert(_cvt_dtoi_fast(-33554432.0) == -33554432); + assert(_cvt_dtoi_fast(33554436.0) == 33554436); + assert(_cvt_dtoi_fast(-33554436.0) == -33554436); + assert(_cvt_dtoi_fast(70369281048576.0) == -2147483648); + assert(_cvt_dtoi_fast(-70369281048576.0) == -2147483648); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + long _cvt_dtoll_fast(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (__ctfe) + { + if (value < twoExp63Double && value >= -twoExp63Double) + { + return cast(long) value; + } + + return 0x80000000_00000000; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttsd2si64;}); + + return __builtin_ia32_cvttsd2si64(value); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + enum ubyte RAX_XMM0 = 0b11_000_000; + + asm @trusted pure nothrow @nogc + { + naked; + /* DMD refuses to encode `cvttsd2si RAX, XMM0`, so we'll encode it by hand. */ + db 0xF2, REX_W, 0x0F, 0x2C, RAX_XMM0; /* cvttsd2si RAX, XMM0 */ + ret; + } + } + } + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double && value >= -twoExp31Double) + { + return _cvt_dtoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent less-than 63, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. 
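Unlike the float case, a double's 53-bit significand can hold more bits than a small exponent calls for, so the code below shifts it right when the exponent is under 52 and left otherwise. 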
*/ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + long sign = (cast(int) high) >> 31; + assert(sign == 0 || sign == -1); + + int exponent = ((high >>> 20) & 2047) - 1023; + /* NaNs and infinity exponents will result in 1024, whereas numeric exponents will be at-least 31. */ + assert(exponent >= 31); + + /* When the value is an infinity or NaN, the MSVC intrinsic always negates the significand. */ + if (exponent == 1024) + { + sign = -1; + } + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 31); + } + else + { + significand <<= (shiftCount & 31); + } + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + ulong adjustedSignificand = (significand ^ sign) - sign; + assert(sign == 0 ? adjustedSignificand == significand : adjustedSignificand == -significand); + + return adjustedSignificand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoll_fast(0.0) == 0); + assert(_cvt_dtoll_fast(-0.0) == 0); + assert(_cvt_dtoll_fast(float.nan) == -9223372036854775808); + assert(_cvt_dtoll_fast(-float.nan) == -9223372036854775808); + assert(_cvt_dtoll_fast(1.0) == 1); + assert(_cvt_dtoll_fast(-1.0) == -1); + assert(_cvt_dtoll_fast(2.5) == 2); + assert(_cvt_dtoll_fast(-2.5) == -2); + assert(_cvt_dtoll_fast(3.5) == 3); + assert(_cvt_dtoll_fast(-3.5) == -3); + assert(_cvt_dtoll_fast(3.49) == 3); + assert(_cvt_dtoll_fast(-3.49) == -3); + assert(_cvt_dtoll_fast(twoExp31Float) == 2147483648); + assert(_cvt_dtoll_fast(-twoExp31Float) == -2147483648); + assert(_cvt_dtoll_fast(twoExp63Float) == -9223372036854775808); + assert(_cvt_dtoll_fast(-twoExp63Float) == -9223372036854775808); + assert(_cvt_dtoll_fast(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_dtoll_fast(33554432.0) == 33554432); + assert(_cvt_dtoll_fast(-33554432.0) == -33554432); + assert(_cvt_dtoll_fast(33554436.0) == 33554436); + assert(_cvt_dtoll_fast(-33554436.0) == -33554436); + assert(_cvt_dtoll_fast(70369281048576.0) == 70369281048576); + assert(_cvt_dtoll_fast(-70369281048576.0) == -70369281048576); + + version (X86_64) + { + assert(_cvt_dtoll_fast(float.infinity) == -9223372036854775808); + assert(_cvt_dtoll_fast(-float.infinity) == -9223372036854775808); + } + else version (X86) + { + assert(_cvt_dtoll_fast(float.infinity) == 0); + assert(_cvt_dtoll_fast(-float.infinity) == 0); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_dtoui_fast(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + return cast(uint) _cvt_dtoll_fast(value); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double || value != value) + { + return cast(uint) _cvt_dtoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent of 31, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. 
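The shift below lines the implicit leading significand bit (bit 52) up with bit 31, the top bit of the truncated 32-bit result. 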
*/ + + /* We have 52-bits stored for the significand, and we know that the exponent is + at-least 31, and we only care about being correct for an exponent of 31, + which means that we can just shift left unconditionally by 21 (52 - 31), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. */ + return cast(uint) (*(cast(const(ulong)*) &value) >>> 21) | (1 << 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoui_fast(0.0) == 0); + assert(_cvt_dtoui_fast(-0.0) == 0); + assert(_cvt_dtoui_fast(1.0) == 1); + assert(_cvt_dtoui_fast(-1.0) == 4294967295); + assert(_cvt_dtoui_fast(2.5) == 2); + assert(_cvt_dtoui_fast(-2.5) == 4294967294); + assert(_cvt_dtoui_fast(3.5) == 3); + assert(_cvt_dtoui_fast(-3.5) == 4294967293); + assert(_cvt_dtoui_fast(3.49) == 3); + assert(_cvt_dtoui_fast(-3.49) == 4294967293); + assert(_cvt_dtoui_fast(twoExp31Float) == 2147483648); + assert(_cvt_dtoui_fast(-twoExp31Float) == 2147483648); + assert(_cvt_dtoui_fast(33554432.0) == 33554432); + assert(_cvt_dtoui_fast(-33554432.0) == 4261412864); + assert(_cvt_dtoui_fast(33554436.0) == 33554436); + assert(_cvt_dtoui_fast(-33554436.0) == 4261412860); + + version (X86_64) + { + assert(_cvt_dtoui_fast(float.nan) == 0); + assert(_cvt_dtoui_fast(-float.nan) == 0); + assert(_cvt_dtoui_fast(float.infinity) == 0); + assert(_cvt_dtoui_fast(-float.infinity) == 0); + assert(_cvt_dtoui_fast(twoExp63Float) == 0); + assert(_cvt_dtoui_fast(-twoExp63Float) == 0); + assert(_cvt_dtoui_fast(justUnderTwoExp63Float) == 0); + assert(_cvt_dtoui_fast(70369281048576.0) == 536870912); + assert(_cvt_dtoui_fast(-70369281048576.0) == 3758096384); + } + else version (X86) + { + assert(_cvt_dtoui_fast(float.nan) == 2147483648); + assert(_cvt_dtoui_fast(-float.nan) == 2147483648); + assert(_cvt_dtoui_fast(float.infinity) == 2147483648); + assert(_cvt_dtoui_fast(-float.infinity) == 2147483648); + assert(_cvt_dtoui_fast(twoExp63Float) == 2147483648); + assert(_cvt_dtoui_fast(-twoExp63Float) == 2147483648); + assert(_cvt_dtoui_fast(justUnderTwoExp63Float) == 4294967040); + assert(_cvt_dtoui_fast(70369281048576.0) == 2147500032); + assert(_cvt_dtoui_fast(-70369281048576.0) == 2147483648); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_dtoull_fast(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Double || value != value) + { + return cast(ulong) _cvt_dtoll_fast(value); + } + + /* At this point, the exponent is at-least 63, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent of 63, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. */ + + /* We have 52-bits stored for the significand, and we know that the exponent is + at-least 63, and we only care about being correct for an exponent of 63, + which means that we can just shift left unconditionally by 11 (63 - 52), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. */ + return (*(cast(const(ulong)*) &value) << 11) | (ulong(1) << 63); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. 
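+               The `value != value` clause below routes NaNs through _cvt_dtoi_fast as well,
+               because any comparison involving a NaN is false; int.min cast to uint is how
+               the X86 tests end up expecting 2147483648 for NaN inputs.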
*/ + if (value < twoExp31Double || value != value) + { + return cast(uint) _cvt_dtoi_fast(value); + } + + /* At this point, the exponent is at-least 31, and the value may be an infinity or NaN. + We care about being correct for values with only an exponent less-than 64, + which excludes infinities and NaNs, because that's how the MSVC intrinsic behaves. */ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + int exponent = ((high >>> 20) & 2047) - 1023; + /* NaNs and infinity exponents will result in 1024, whereas numeric exponents will be at-least 31. */ + assert(exponent >= 31); + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 31); + } + else + { + significand <<= (shiftCount & 31); + } + + return significand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoull_fast(0.0) == 0); + assert(_cvt_dtoull_fast(-0.0) == 0); + assert(_cvt_dtoull_fast(1.0) == 1); + assert(_cvt_dtoull_fast(2.5) == 2); + assert(_cvt_dtoull_fast(3.5) == 3); + assert(_cvt_dtoull_fast(3.49) == 3); + assert(_cvt_dtoull_fast(twoExp31Float) == 2147483648); + assert(_cvt_dtoull_fast(twoExp63Float) == 9223372036854775808); + assert(_cvt_dtoull_fast(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_dtoull_fast(33554432.0) == 33554432); + assert(_cvt_dtoull_fast(33554436.0) == 33554436); + assert(_cvt_dtoull_fast(70369281048576.0) == 70369281048576); + + version (X86_64) + { + assert(_cvt_dtoull_fast(float.nan) == 9223372036854775808); + assert(_cvt_dtoull_fast(-float.nan) == 9223372036854775808); + assert(_cvt_dtoull_fast(float.infinity) == 9223372036854775808); + assert(_cvt_dtoull_fast(-float.infinity) == 9223372036854775808); + assert(_cvt_dtoull_fast(-1.0) == 18446744073709551615); + assert(_cvt_dtoull_fast(-2.5) == 18446744073709551614); + assert(_cvt_dtoull_fast(-3.5) == 18446744073709551613); + assert(_cvt_dtoull_fast(-3.49) == 18446744073709551613); + assert(_cvt_dtoull_fast(-twoExp31Float) == 18446744071562067968); + assert(_cvt_dtoull_fast(-twoExp63Float) == 9223372036854775808); + assert(_cvt_dtoull_fast(-33554432.0) == 18446744073675997184); + assert(_cvt_dtoull_fast(-33554436.0) == 18446744073675997180); + assert(_cvt_dtoull_fast(-70369281048576.0) == 18446673704428503040); + } + else version (X86) + { + assert(_cvt_dtoull_fast(float.nan) == 2147483648); + assert(_cvt_dtoull_fast(-float.nan) == 2147483648); + assert(_cvt_dtoull_fast(float.infinity) == 0); + assert(_cvt_dtoull_fast(-float.infinity) == 2147483648); + assert(_cvt_dtoull_fast(-1.0) == 4294967295); + assert(_cvt_dtoull_fast(-2.5) == 4294967294); + assert(_cvt_dtoull_fast(-3.5) == 4294967293); + assert(_cvt_dtoull_fast(-3.49) == 4294967293); + assert(_cvt_dtoull_fast(-twoExp31Float) == 2147483648); + assert(_cvt_dtoull_fast(-twoExp63Float) == 2147483648); + assert(_cvt_dtoull_fast(-33554432.0) == 4261412864); + assert(_cvt_dtoull_fast(-33554436.0) == 4261412860); + assert(_cvt_dtoull_fast(-70369281048576.0) == 2147483648); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _cvt_ftoi_sat(float value) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (value >= twoExp31Float) + { + return int.max; + } + + if (value < -twoExp31Float) + { + return int.min; + } + + 
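+            /* NaNs compare false with everything, so neither range check above catches them;
+               returning 0 here reproduces the MSVC intrinsic's saturating behaviour for NaN. */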
if (value != value) + { + return 0; + } + + return cast(int) value; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttss2si;}); + + if (value >= twoExp31Float) + { + return int.max; + } + + if (value != value) + { + return 0; + } + + /* If value is less-than -twoExp31Float cvttss2si will evaluate to int.min. */ + return __builtin_ia32_cvttss2si(value); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + naked; + ucomiss XMM0, twoExp31FloatInstance; + mov EAX, int.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp31Float. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp31Float cvttss2si will evaluate to int.min. */ + cvttss2si EAX, XMM0; + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + movss XMM0, [ESP + 4]; + ucomiss XMM0, twoExp31FloatInstance; + mov EAX, int.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp31Float. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp31Float cvttss2si will evaluate to int.min. */ + cvttss2si EAX, XMM0; + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoi_sat(0.0f) == 0); + assert(_cvt_ftoi_sat(-0.0f) == 0); + assert(_cvt_ftoi_sat(float.nan) == 0); + assert(_cvt_ftoi_sat(-float.nan) == 0); + assert(_cvt_ftoi_sat(float.infinity) == 2147483647); + assert(_cvt_ftoi_sat(-float.infinity) == -2147483648); + assert(_cvt_ftoi_sat(1.0f) == 1); + assert(_cvt_ftoi_sat(-1.0f) == -1); + assert(_cvt_ftoi_sat(2.5f) == 2); + assert(_cvt_ftoi_sat(-2.5f) == -2); + assert(_cvt_ftoi_sat(3.5f) == 3); + assert(_cvt_ftoi_sat(-3.5f) == -3); + assert(_cvt_ftoi_sat(3.49f) == 3); + assert(_cvt_ftoi_sat(-3.49f) == -3); + assert(_cvt_ftoi_sat(twoExp31Float) == 2147483647); + assert(_cvt_ftoi_sat(-twoExp31Float) == -2147483648); + assert(_cvt_ftoi_sat(twoExp63Float) == 2147483647); + assert(_cvt_ftoi_sat(-twoExp63Float) == -2147483648); + assert(_cvt_ftoi_sat(justUnderTwoExp63Float) == 2147483647); + assert(_cvt_ftoi_sat(33554432.0f) == 33554432); + assert(_cvt_ftoi_sat(-33554432.0f) == -33554432); + assert(_cvt_ftoi_sat(33554436.0f) == 33554436); + assert(_cvt_ftoi_sat(-33554436.0f) == -33554436); + assert(_cvt_ftoi_sat(70369281048576.0f) == 2147483647); + assert(_cvt_ftoi_sat(-70369281048576.0f) == -2147483648); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + long _cvt_ftoll_sat(float value) @trusted pure nothrow @nogc + { + if (__ctfe) + { + if (value >= twoExp63Float) + { + return long.max; + } + + if (value < -twoExp63Float) + { + return long.min; + } + + if (value != value) + { + return 0; + } + + return cast(long) value; + } + else + { + version (X86_64) + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttss2si64;}); + + if (value >= twoExp63Float) + { + return long.max; + } + + if (value != value) + { + return 0; + } + + /* If value is less-than -twoExp63Float cvttss2si will evaluate to long.min. 
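+                       (For out-of-range inputs, cvttss2si produces the x86 "integer indefinite"
+                       value 0x8000000000000000, which is exactly long.min, so no separate
+                       lower-bound check is needed on that side.)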
*/ + return __builtin_ia32_cvttss2si64(value); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + enum ubyte RAX_XMM0 = 0b11_000_000; + + asm @trusted pure nothrow @nogc + { + naked; + ucomiss XMM0, twoExp63FloatInstance; + mov RAX, long.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp63Float. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp63Float cvttss2si will evaluate to long.min. */ + /* DMD refuses to encode `cvttss2si RAX, XMM0`, so we'll encode it by hand. */ + db 0xF3, REX_W, 0x0F, 0x2C, RAX_XMM0; /* cvttss2si RAX, XMM0 */ + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + } + else version (X86) + { + import std.math : nextUp; + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float && value >= -twoExp31Float) + { + return _cvt_ftoi_fast(value); + } + + if (value >= twoExp63Float) + { + return long.max; + } + + if (value <= -twoExp63Float) + { + return long.min; + } + + if (value != value) + { + return 0; + } + + /* At this point, the exponent is at-least 31 and less-than 64. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + uint sign = asInt >> 31; + assert(sign == 0 || sign == -1); + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. */ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent >= 32); + assert(exponent <= 63); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint unadjustedSignificand = (asInt << 8) | (1 << 31); + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + uint significand = (unadjustedSignificand ^ sign) - sign; + assert(sign == 0 ? 
significand == unadjustedSignificand : significand == -unadjustedSignificand); + + uint highHalf = funnelShiftLeft(significand, sign, exponent & 31); + + return (ulong(highHalf) << 32) | ulong(significand << (exponent & 31)); + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoll_sat(0.0f) == 0); + assert(_cvt_ftoll_sat(-0.0f) == 0); + assert(_cvt_ftoll_sat(float.nan) == 0); + assert(_cvt_ftoll_sat(-float.nan) == 0); + assert(_cvt_ftoll_sat(float.infinity) == 9223372036854775807); + assert(_cvt_ftoll_sat(-float.infinity) == -9223372036854775808); + assert(_cvt_ftoll_sat(1.0f) == 1); + assert(_cvt_ftoll_sat(-1.0f) == -1); + assert(_cvt_ftoll_sat(2.5f) == 2); + assert(_cvt_ftoll_sat(-2.5f) == -2); + assert(_cvt_ftoll_sat(3.5f) == 3); + assert(_cvt_ftoll_sat(-3.5f) == -3); + assert(_cvt_ftoll_sat(3.49f) == 3); + assert(_cvt_ftoll_sat(-3.49f) == -3); + assert(_cvt_ftoll_sat(twoExp31Float) == 2147483648); + assert(_cvt_ftoll_sat(-twoExp31Float) == -2147483648); + assert(_cvt_ftoll_sat(twoExp63Float) == 9223372036854775807); + assert(_cvt_ftoll_sat(-twoExp63Float) == -9223372036854775808); + assert(_cvt_ftoll_sat(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoll_sat(33554432.0f) == 33554432); + assert(_cvt_ftoll_sat(-33554432.0f) == -33554432); + assert(_cvt_ftoll_sat(33554436.0f) == 33554436); + assert(_cvt_ftoll_sat(-33554436.0f) == -33554436); + assert(_cvt_ftoll_sat(70369281048576.0f) == 70369281048576); + assert(_cvt_ftoll_sat(-70369281048576.0f) == -70369281048576); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_ftoui_sat(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value >= twoExp32Float) + { + return uint.max; + } + + if (value < 0.0f || value != value) + { + return 0; + } + + return cast(uint) _cvt_ftoll_fast(value); + } + else version (X86) + { + if (value < 0.0f || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float) + { + return cast(uint) _cvt_ftoi_fast(value); + } + + if (value >= twoExp32Float) + { + return uint.max; + } + + /* At this point, the exponent is 31. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is 31, + which means that we can just shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. 
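+               For example, 2^31 is 0x4F000000 as a float: shifting left by 8 discards the sign
+               and exponent fields and leaves the (all-zero) fraction, and ORing in the implicit
+               bit gives 0x80000000, i.e. 2147483648, as the tests below expect.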
*/ + return (*(cast(const(uint)*) &value) << 8) | (1 << 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoui_sat(0.0f) == 0); + assert(_cvt_ftoui_sat(-0.0f) == 0); + assert(_cvt_ftoui_sat(float.nan) == 0); + assert(_cvt_ftoui_sat(-float.nan) == 0); + assert(_cvt_ftoui_sat(float.infinity) == 4294967295); + assert(_cvt_ftoui_sat(-float.infinity) == 0); + assert(_cvt_ftoui_sat(1.0f) == 1); + assert(_cvt_ftoui_sat(-1.0f) == 0); + assert(_cvt_ftoui_sat(2.5f) == 2); + assert(_cvt_ftoui_sat(-2.5f) == 0); + assert(_cvt_ftoui_sat(3.5f) == 3); + assert(_cvt_ftoui_sat(-3.5f) == 0); + assert(_cvt_ftoui_sat(3.49f) == 3); + assert(_cvt_ftoui_sat(-3.49f) == 0); + assert(_cvt_ftoui_sat(twoExp31Float) == 2147483648); + assert(_cvt_ftoui_sat(-twoExp31Float) == 0); + assert(_cvt_ftoui_sat(twoExp63Float) == 4294967295); + assert(_cvt_ftoui_sat(-twoExp63Float) == 0); + assert(_cvt_ftoui_sat(justUnderTwoExp63Float) == 4294967295); + assert(_cvt_ftoui_sat(33554432.0f) == 33554432); + assert(_cvt_ftoui_sat(-33554432.0f) == 0); + assert(_cvt_ftoui_sat(33554436.0f) == 33554436); + assert(_cvt_ftoui_sat(-33554436.0f) == 0); + assert(_cvt_ftoui_sat(70369281048576.0f) == 4294967295); + assert(_cvt_ftoui_sat(-70369281048576.0f) == 0); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_ftoull_sat(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value < 0.0f || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Float) + { + return cast(ulong) _cvt_ftoll_fast(value); + } + + if (value >= twoExp64Float) + { + return ulong.max; + } + + /* At this point, the exponent is 63. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is 63, + which means that we can just shift left unconditionally by 40, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. */ + return (ulong(*(cast(const(uint)*) &value)) << 40) | (ulong(1) << 63); + } + else version (X86) + { + if (value < 0.0f || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float) + { + return cast(ulong) _cvt_ftoi_fast(value); + } + + if (value >= twoExp64Float) + { + return ulong.max; + } + + /* At this point, the exponent is at-least 31 and less-than 64. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. 
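+               Concretely, 2^31 has a biased exponent of 158, and 158 - 126 = 32, which masks
+               to a shift of 0; 2^63 has a biased exponent of 190, and 190 - 126 = 64, which is
+               special-cased below to a shift of 32 because 64 & 31 would wrongly give 0.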
*/ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent >= 32); + assert(exponent <= 64); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint significand = (asInt << 8) | (1 << 31); + + return ulong(significand) << (exponent == 64 ? 32 : (exponent & 31)); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoull_sat(0.0f) == 0); + assert(_cvt_ftoull_sat(-0.0f) == 0); + assert(_cvt_ftoull_sat(float.nan) == 0); + assert(_cvt_ftoull_sat(-float.nan) == 0); + assert(_cvt_ftoull_sat(float.infinity) == 18446744073709551615); + assert(_cvt_ftoull_sat(-float.infinity) == 0); + assert(_cvt_ftoull_sat(1.0f) == 1); + assert(_cvt_ftoull_sat(-1.0f) == 0); + assert(_cvt_ftoull_sat(2.5f) == 2); + assert(_cvt_ftoull_sat(-2.5f) == 0); + assert(_cvt_ftoull_sat(3.5f) == 3); + assert(_cvt_ftoull_sat(-3.5f) == 0); + assert(_cvt_ftoull_sat(3.49f) == 3); + assert(_cvt_ftoull_sat(-3.49f) == 0); + assert(_cvt_ftoull_sat(twoExp31Float) == 2147483648); + assert(_cvt_ftoull_sat(-twoExp31Float) == 0); + assert(_cvt_ftoull_sat(twoExp63Float) == 9223372036854775808); + assert(_cvt_ftoull_sat(-twoExp63Float) == 0); + assert(_cvt_ftoull_sat(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoull_sat(33554432.0f) == 33554432); + assert(_cvt_ftoull_sat(-33554432.0f) == 0); + assert(_cvt_ftoull_sat(33554436.0f) == 33554436); + assert(_cvt_ftoull_sat(-33554436.0f) == 0); + assert(_cvt_ftoull_sat(70369281048576.0f) == 70369281048576); + assert(_cvt_ftoull_sat(-70369281048576.0f) == 0); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _cvt_dtoi_sat(double value) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (value >= twoExp31Double) + { + return int.max; + } + + if (value < -twoExp31Double) + { + return int.min; + } + + if (value != value) + { + return 0; + } + + return cast(int) value; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttsd2si;}); + + if (value >= twoExp31Double) + { + return int.max; + } + + if (value != value) + { + return 0; + } + + /* If value is less-than -twoExp31Double cvttsd2si will evaluate to int.min. */ + return __builtin_ia32_cvttsd2si(value); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + naked; + ucomisd XMM0, twoExp31DoubleInstance; + mov EAX, int.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp31DoubleInstance. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp31DoubleInstance cvttsd2si will evaluate to int.min. */ + cvttsd2si EAX, XMM0; + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + movsd XMM0, [ESP + 4]; + ucomisd XMM0, twoExp31DoubleInstance; + mov EAX, int.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp31DoubleInstance. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp31DoubleInstance cvttsd2si will evaluate to int.min. 
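+                       ucomisd reports an unordered comparison by setting ZF, PF, and CF all
+                       to 1, so a NaN is not taken by the `jae` (CF is set) and is caught by
+                       the `jp` instead.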
*/ + cvttsd2si EAX, XMM0; + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoi_sat(0.0) == 0); + assert(_cvt_dtoi_sat(-0.0) == 0); + assert(_cvt_dtoi_sat(float.nan) == 0); + assert(_cvt_dtoi_sat(-float.nan) == 0); + assert(_cvt_dtoi_sat(float.infinity) == 2147483647); + assert(_cvt_dtoi_sat(-float.infinity) == -2147483648); + assert(_cvt_dtoi_sat(1.0) == 1); + assert(_cvt_dtoi_sat(-1.0) == -1); + assert(_cvt_dtoi_sat(2.5) == 2); + assert(_cvt_dtoi_sat(-2.5) == -2); + assert(_cvt_dtoi_sat(3.5) == 3); + assert(_cvt_dtoi_sat(-3.5) == -3); + assert(_cvt_dtoi_sat(3.49) == 3); + assert(_cvt_dtoi_sat(-3.49) == -3); + assert(_cvt_dtoi_sat(twoExp31Float) == 2147483647); + assert(_cvt_dtoi_sat(-twoExp31Float) == -2147483648); + assert(_cvt_dtoi_sat(twoExp63Float) == 2147483647); + assert(_cvt_dtoi_sat(-twoExp63Float) == -2147483648); + assert(_cvt_dtoi_sat(justUnderTwoExp63Float) == 2147483647); + assert(_cvt_dtoi_sat(33554432.0) == 33554432); + assert(_cvt_dtoi_sat(-33554432.0) == -33554432); + assert(_cvt_dtoi_sat(33554436.0) == 33554436); + assert(_cvt_dtoi_sat(-33554436.0) == -33554436); + assert(_cvt_dtoi_sat(70369281048576.0) == 2147483647); + assert(_cvt_dtoi_sat(-70369281048576.0) == -2147483648); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + long _cvt_dtoll_sat(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (__ctfe) + { + if (value >= twoExp63Double) + { + return long.max; + } + + if (value < -twoExp63Double) + { + return long.min; + } + + if (value != value) + { + return 0; + } + + return cast(long) value; + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttsd2si64;}); + + if (value >= twoExp63Double) + { + return long.max; + } + + if (value != value) + { + return 0; + } + + /* If value is less-than -twoExp63Double cvttsd2si will evaluate to long.min. */ + return __builtin_ia32_cvttsd2si64(value); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + enum ubyte RAX_XMM0 = 0b11_000_000; + + asm @trusted pure nothrow @nogc + { + naked; + ucomisd XMM0, twoExp63DoubleInstance; + mov RAX, long.max; + jae tooBig; /* Jump if value is greater-or-equal to twoExp63DoubleInstance. */ + jp isNaN; /* Jump if value is NaN. */ + /* If value is less-than -twoExp63DoubleInstance cvttsd2si will evaluate to long.min. */ + /* DMD refuses to encode `cvttsd2si RAX, XMM0`, so we'll encode it by hand. */ + db 0xF2, REX_W, 0x0F, 0x2C, RAX_XMM0; /* cvttsd2si RAX, XMM0 */ + ret; + isNaN: + xor EAX, EAX; + tooBig: + ret; + } + } + } + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double && value >= -twoExp31Double) + { + return _cvt_dtoi_fast(value); + } + + if (value >= twoExp63Double) + { + return long.max; + } + + if (value < -twoExp63Double) + { + return long.min; + } + + if (value != value) + { + return 0; + } + + /* At this point, the exponent is at-least 31 and less-than 64. 
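+               The code below reconstructs the integer by hand: extract the 53-bit significand
+               (with the implicit bit ORed in at bit 52), shift it right by 52 - exponent to
+               truncate the fraction when the exponent is below 52 (or left by exponent - 52
+               otherwise), and finally negate branchlessly if the sign bit was set.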
*/ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + long sign = (cast(int) high) >> 31; + assert(sign == 0 || sign == -1); + + int exponent = ((high >>> 20) & 2047) - 1023; + assert(exponent >= 31); + assert(exponent <= 63); + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 63); + } + else + { + significand <<= (shiftCount & 63); + } + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + ulong adjustedSignificand = (significand ^ sign) - sign; + assert(sign == 0 ? adjustedSignificand == significand : adjustedSignificand == -significand); + + return adjustedSignificand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoll_sat(0.0) == 0); + assert(_cvt_dtoll_sat(-0.0) == 0); + assert(_cvt_dtoll_sat(double.nan) == 0); + assert(_cvt_dtoll_sat(-double.nan) == 0); + assert(_cvt_dtoll_sat(double.infinity) == 9223372036854775807); + assert(_cvt_dtoll_sat(-double.infinity) == -9223372036854775808); + assert(_cvt_dtoll_sat(1.0) == 1); + assert(_cvt_dtoll_sat(-1.0) == -1); + assert(_cvt_dtoll_sat(2.5) == 2); + assert(_cvt_dtoll_sat(-2.5) == -2); + assert(_cvt_dtoll_sat(3.5) == 3); + assert(_cvt_dtoll_sat(-3.5) == -3); + assert(_cvt_dtoll_sat(3.49) == 3); + assert(_cvt_dtoll_sat(-3.49) == -3); + assert(_cvt_dtoll_sat(twoExp31Double) == 2147483648); + assert(_cvt_dtoll_sat(-twoExp31Double) == -2147483648); + assert(_cvt_dtoll_sat(twoExp63Double) == 9223372036854775807); + assert(_cvt_dtoll_sat(-twoExp63Double) == -9223372036854775808); + assert(_cvt_dtoll_sat(justUnderTwoExp63Double) == 9223371487098961920); + assert(_cvt_dtoll_sat(33554432.0) == 33554432); + assert(_cvt_dtoll_sat(-33554432.0) == -33554432); + assert(_cvt_dtoll_sat(33554436.0) == 33554436); + assert(_cvt_dtoll_sat(-33554436.0) == -33554436); + assert(_cvt_dtoll_sat(70369281048576.0) == 70369281048576); + assert(_cvt_dtoll_sat(-70369281048576.0) == -70369281048576); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_dtoui_sat(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value >= twoExp32Double) + { + return uint.max; + } + + if (value < 0.0 || value != value) + { + return 0; + } + + return cast(uint) _cvt_dtoll_fast(value); + } + else version (X86) + { + if (value < 0.0 || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double) + { + return cast(uint) _cvt_dtoi_fast(value); + } + + if (value >= twoExp32Double) + { + return uint.max; + } + + /* At this point, the exponent is 31. */ + + /* We have 52-bits stored for the significand, and we know that the exponent is 31, + which means that we can just shift left unconditionally by 21 (52 - 31), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. 
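+               For example, 2^31 is 0x41E0000000000000 as a double: its fraction bits are all
+               zero, so the raw bits shifted right by 21 contribute nothing to the low 32 bits,
+               and ORing in the implicit bit at position 31 gives 2147483648, as the tests
+               below expect.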
*/ + return cast(uint) (*(cast(const(ulong)*) &value) >>> 21) | (1 << 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoui_sat(0.0) == 0); + assert(_cvt_dtoui_sat(-0.0) == 0); + assert(_cvt_dtoui_sat(float.nan) == 0); + assert(_cvt_dtoui_sat(-float.nan) == 0); + assert(_cvt_dtoui_sat(float.infinity) == 4294967295); + assert(_cvt_dtoui_sat(-float.infinity) == 0); + assert(_cvt_dtoui_sat(1.0) == 1); + assert(_cvt_dtoui_sat(-1.0) == 0); + assert(_cvt_dtoui_sat(2.5) == 2); + assert(_cvt_dtoui_sat(-2.5) == 0); + assert(_cvt_dtoui_sat(3.5) == 3); + assert(_cvt_dtoui_sat(-3.5) == 0); + assert(_cvt_dtoui_sat(3.49) == 3); + assert(_cvt_dtoui_sat(-3.49) == 0); + assert(_cvt_dtoui_sat(twoExp31Float) == 2147483648); + assert(_cvt_dtoui_sat(-twoExp31Float) == 0); + assert(_cvt_dtoui_sat(twoExp63Float) == 4294967295); + assert(_cvt_dtoui_sat(-twoExp63Float) == 0); + assert(_cvt_dtoui_sat(justUnderTwoExp63Float) == 4294967295); + assert(_cvt_dtoui_sat(33554432.0) == 33554432); + assert(_cvt_dtoui_sat(-33554432.0) == 0); + assert(_cvt_dtoui_sat(33554436.0) == 33554436); + assert(_cvt_dtoui_sat(-33554436.0) == 0); + assert(_cvt_dtoui_sat(70369281048576.0) == 4294967295); + assert(_cvt_dtoui_sat(-70369281048576.0) == 0); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_dtoull_sat(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value < 0.0 || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Double) + { + return cast(ulong) _cvt_dtoll_fast(value); + } + + if (value >= twoExp64Double) + { + return ulong.max; + } + + /* At this point, the exponent is 63. */ + + /* We have 52-bits stored for the significand, and we know that the exponent is 63, + which means that we can just shift left unconditionally by 11 (63 - 52), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. */ + return (*(cast(const(ulong)*) &value) << 11) | (ulong(1) << 63); + } + else version (X86) + { + if (value < 0.0 || value != value) + { + return 0; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double) + { + return cast(ulong) _cvt_dtoi_fast(value); + } + + if (value >= twoExp64Double) + { + return ulong.max; + } + + /* At this point, the exponent is at-least 31 and less-than 64. */ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + int exponent = ((high >>> 20) & 2047) - 1023; + assert(exponent >= 31); + assert(exponent <= 63); + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? 
exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 63); + } + else + { + significand <<= (shiftCount & 63); + } + + return significand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoull_sat(0.0) == 0); + assert(_cvt_dtoull_sat(-0.0) == 0); + assert(_cvt_dtoull_sat(float.nan) == 0); + assert(_cvt_dtoull_sat(-float.nan) == 0); + assert(_cvt_dtoull_sat(float.infinity) == 18446744073709551615); + assert(_cvt_dtoull_sat(-float.infinity) == 0); + assert(_cvt_dtoull_sat(1.0) == 1); + assert(_cvt_dtoull_sat(-1.0) == 0); + assert(_cvt_dtoull_sat(2.5) == 2); + assert(_cvt_dtoull_sat(-2.5) == 0); + assert(_cvt_dtoull_sat(3.5) == 3); + assert(_cvt_dtoull_sat(-3.5) == 0); + assert(_cvt_dtoull_sat(3.49) == 3); + assert(_cvt_dtoull_sat(-3.49) == 0); + assert(_cvt_dtoull_sat(twoExp31Float) == 2147483648); + assert(_cvt_dtoull_sat(-twoExp31Float) == 0); + assert(_cvt_dtoull_sat(twoExp63Float) == 9223372036854775808); + assert(_cvt_dtoull_sat(-twoExp63Float) == 0); + assert(_cvt_dtoull_sat(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_dtoull_sat(33554432.0) == 33554432); + assert(_cvt_dtoull_sat(-33554432.0) == 0); + assert(_cvt_dtoull_sat(33554436.0) == 33554436); + assert(_cvt_dtoull_sat(-33554436.0) == 0); + assert(_cvt_dtoull_sat(70369281048576.0) == 70369281048576); + assert(_cvt_dtoull_sat(-70369281048576.0) == 0); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _cvt_ftoi_sent(float value) @safe pure nothrow @nogc + { + return _cvt_ftoi_fast(value); + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoi_sent(0.0f) == 0); + assert(_cvt_ftoi_sent(-0.0f) == 0); + assert(_cvt_ftoi_sent(float.nan) == -2147483648); + assert(_cvt_ftoi_sent(-float.nan) == -2147483648); + assert(_cvt_ftoi_sent(float.infinity) == -2147483648); + assert(_cvt_ftoi_sent(-float.infinity) == -2147483648); + assert(_cvt_ftoi_sent(1.0f) == 1); + assert(_cvt_ftoi_sent(-1.0f) == -1); + assert(_cvt_ftoi_sent(2.5f) == 2); + assert(_cvt_ftoi_sent(-2.5f) == -2); + assert(_cvt_ftoi_sent(3.5f) == 3); + assert(_cvt_ftoi_sent(-3.5f) == -3); + assert(_cvt_ftoi_sent(3.49f) == 3); + assert(_cvt_ftoi_sent(-3.49f) == -3); + assert(_cvt_ftoi_sent(twoExp31Float) == -2147483648); + assert(_cvt_ftoi_sent(-twoExp31Float) == -2147483648); + assert(_cvt_ftoi_sent(twoExp63Float) == -2147483648); + assert(_cvt_ftoi_sent(-twoExp63Float) == -2147483648); + assert(_cvt_ftoi_sent(justUnderTwoExp63Float) == -2147483648); + assert(_cvt_ftoi_sent(33554432.0f) == 33554432); + assert(_cvt_ftoi_sent(-33554432.0f) == -33554432); + assert(_cvt_ftoi_sent(33554436.0f) == 33554436); + assert(_cvt_ftoi_sent(-33554436.0f) == -33554436); + assert(_cvt_ftoi_sent(70369281048576.0f) == -2147483648); + assert(_cvt_ftoi_sent(-70369281048576.0f) == -2147483648); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + long _cvt_ftoll_sent(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + return _cvt_ftoll_fast(value); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float && value >= -twoExp31Float) + { + return _cvt_ftoi_fast(value); + } + + if (!(value < twoExp63Float && value > -twoExp63Float)) + { + return 0x80000000_00000000; + } + + /* At this point, the exponent is at-least 31 and less-than 64. 
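+               (The `!(... && ...)` test above also routes NaNs to the 0x80000000_00000000
+               sentinel, because comparisons involving NaN are always false; hence the NaN
+               cases in the tests below.)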
+ Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + uint sign = asInt >> 31; + assert(sign == 0 || sign == -1); + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. */ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent >= 32); + assert(exponent <= 63); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint unadjustedSignificand = (asInt << 8) | (1 << 31); + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + uint significand = (unadjustedSignificand ^ sign) - sign; + assert(sign == 0 ? significand == unadjustedSignificand : significand == -unadjustedSignificand); + + uint highHalf = funnelShiftLeft(significand, sign, exponent & 31); + + return (ulong(highHalf) << 32) | ulong(significand << (exponent & 31)); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoll_sent(0.0f) == 0); + assert(_cvt_ftoll_sent(-0.0f) == 0); + assert(_cvt_ftoll_sent(float.nan) == -9223372036854775808); + assert(_cvt_ftoll_sent(-float.nan) == -9223372036854775808); + assert(_cvt_ftoll_sent(float.infinity) == -9223372036854775808); + assert(_cvt_ftoll_sent(-float.infinity) == -9223372036854775808); + assert(_cvt_ftoll_sent(1.0f) == 1); + assert(_cvt_ftoll_sent(-1.0f) == -1); + assert(_cvt_ftoll_sent(2.5f) == 2); + assert(_cvt_ftoll_sent(-2.5f) == -2); + assert(_cvt_ftoll_sent(3.5f) == 3); + assert(_cvt_ftoll_sent(-3.5f) == -3); + assert(_cvt_ftoll_sent(3.49f) == 3); + assert(_cvt_ftoll_sent(-3.49f) == -3); + assert(_cvt_ftoll_sent(twoExp31Float) == 2147483648); + assert(_cvt_ftoll_sent(-twoExp31Float) == -2147483648); + assert(_cvt_ftoll_sent(twoExp63Float) == -9223372036854775808); + assert(_cvt_ftoll_sent(-twoExp63Float) == -9223372036854775808); + assert(_cvt_ftoll_sent(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoll_sent(33554432.0f) == 33554432); + assert(_cvt_ftoll_sent(-33554432.0f) == -33554432); + assert(_cvt_ftoll_sent(33554436.0f) == 33554436); + assert(_cvt_ftoll_sent(-33554436.0f) == -33554436); + assert(_cvt_ftoll_sent(70369281048576.0f) == 70369281048576); + assert(_cvt_ftoll_sent(-70369281048576.0f) == -70369281048576); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_ftoui_sent(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + const integer = cast(ulong) _cvt_ftoll_fast(value); + + return integer > uint.max ? 
uint.max : cast(uint) integer; + } + else version (X86) + { + if (*(cast(const(uint)*) &value) <= 0b1_01111111_00000000000000000000000) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Float) + { + return cast(uint) _cvt_ftoi_fast(value); + } + else if (value < twoExp32Float) + { + /* At this point, the exponent is 31, + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is 31, + which means that we can just shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. */ + return (*(cast(const(uint)*) &value) << 8) | (1 << 31); + } + } + + return uint.max; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoui_sent(0.0f) == 0); + assert(_cvt_ftoui_sent(-0.0f) == 0); + assert(_cvt_ftoui_sent(float.nan) == 4294967295); + assert(_cvt_ftoui_sent(-float.nan) == 4294967295); + assert(_cvt_ftoui_sent(float.infinity) == 4294967295); + assert(_cvt_ftoui_sent(-float.infinity) == 4294967295); + assert(_cvt_ftoui_sent(1.0f) == 1); + assert(_cvt_ftoui_sent(-1.0f) == 4294967295); + assert(_cvt_ftoui_sent(2.5f) == 2); + assert(_cvt_ftoui_sent(-2.5f) == 4294967295); + assert(_cvt_ftoui_sent(3.5f) == 3); + assert(_cvt_ftoui_sent(-3.5f) == 4294967295); + assert(_cvt_ftoui_sent(3.49f) == 3); + assert(_cvt_ftoui_sent(-3.49f) == 4294967295); + assert(_cvt_ftoui_sent(twoExp31Float) == 2147483648); + assert(_cvt_ftoui_sent(-twoExp31Float) == 4294967295); + assert(_cvt_ftoui_sent(twoExp63Float) == 4294967295); + assert(_cvt_ftoui_sent(-twoExp63Float) == 4294967295); + assert(_cvt_ftoui_sent(justUnderTwoExp63Float) == 4294967295); + assert(_cvt_ftoui_sent(33554432.0f) == 33554432); + assert(_cvt_ftoui_sent(-33554432.0f) == 4294967295); + assert(_cvt_ftoui_sent(33554436.0f) == 33554436); + assert(_cvt_ftoui_sent(-33554436.0f) == 4294967295); + assert(_cvt_ftoui_sent(70369281048576.0f) == 4294967295); + assert(_cvt_ftoui_sent(-70369281048576.0f) == 4294967295); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_ftoull_sent(float value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value < -1.0f || value != value) + { + return ulong.max; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Float) + { + return cast(ulong) _cvt_ftoll_fast(value); + } + + if (value >= twoExp64Float) + { + return ulong.max; + } + + /* At this point, the exponent is 63. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + /* We have 23-bits stored for the significand, and we know that the exponent is 63, + which means that we can just shift left unconditionally by 40, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. */ + return (ulong(*(cast(const(uint)*) &value)) << 40) | (ulong(1) << 63); + } + else version (X86) + { + if (value < -1.0f || value != value) + { + return ulong.max; + } + + /* If the hardware can handle it, let it handle it. 
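+               Note that the cutoff above is -1.0 rather than 0.0: anything in (-1.0, 0.0)
+               truncates to 0, and -1.0 itself falls through, truncates to -1, and wraps to
+               ulong.max anyway, so only values strictly below -1.0 need the early sentinel.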
*/ + if (value < twoExp31Float) + { + return cast(ulong) _cvt_ftoi_fast(value); + } + + if (value >= twoExp64Float) + { + return ulong.max; + } + + /* At this point, the exponent is at-least 31 and less-than 64. + Because the exponent is at-least 23, the value will never actually contain any + fractional digits, so we can just shift the significand left to get an integer. */ + + int asInt = *(cast(const(int)*) &value); + + /* The exponent is biased by +127, but we subtract only 126 as we want the exponent + to be one-higher than it actually is, so that we shift the correct number of bits + after we mask the exponent by 31. + E.g. with an exponent of 31 we should shift 0 bits, 32 should shift 1 bit, etc.. */ + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + assert(exponent >= 32); + assert(exponent <= 64); + + /* We have 23-bits stored for the significand, and we know that the exponent is + at-least 31, which means that we can shift left unconditionally by 8, which leaves + the implicit bit of the full 24-bit significand to be set at the most-significant bit. + Conveniently, this means that the variable shifting for the exponent concerns only + the high half (remember that this is for 32-bit mode). */ + uint significand = (asInt << 8) | (1 << 31); + + return ulong(significand) << (exponent == 64 ? 32 : (exponent & 31)); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_ftoull_sent(0.0f) == 0); + assert(_cvt_ftoull_sent(-0.0f) == 0); + assert(_cvt_ftoull_sent(float.nan) == 18446744073709551615); + assert(_cvt_ftoull_sent(-float.nan) == 18446744073709551615); + assert(_cvt_ftoull_sent(float.infinity) == 18446744073709551615); + assert(_cvt_ftoull_sent(-float.infinity) == 18446744073709551615); + assert(_cvt_ftoull_sent(1.0f) == 1); + assert(_cvt_ftoull_sent(-1.0f) == 18446744073709551615); + assert(_cvt_ftoull_sent(2.5f) == 2); + assert(_cvt_ftoull_sent(-2.5f) == 18446744073709551615); + assert(_cvt_ftoull_sent(3.5f) == 3); + assert(_cvt_ftoull_sent(-3.5f) == 18446744073709551615); + assert(_cvt_ftoull_sent(3.49f) == 3); + assert(_cvt_ftoull_sent(-3.49f) == 18446744073709551615); + assert(_cvt_ftoull_sent(twoExp31Float) == 2147483648); + assert(_cvt_ftoull_sent(-twoExp31Float) == 18446744073709551615); + assert(_cvt_ftoull_sent(twoExp63Float) == 9223372036854775808); + assert(_cvt_ftoull_sent(-twoExp63Float) == 18446744073709551615); + assert(_cvt_ftoull_sent(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_ftoull_sent(33554432.0f) == 33554432); + assert(_cvt_ftoull_sent(-33554432.0f) == 18446744073709551615); + assert(_cvt_ftoull_sent(33554436.0f) == 33554436); + assert(_cvt_ftoull_sent(-33554436.0f) == 18446744073709551615); + assert(_cvt_ftoull_sent(70369281048576.0f) == 70369281048576); + assert(_cvt_ftoull_sent(-70369281048576.0f) == 18446744073709551615); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _cvt_dtoi_sent(double value) @safe pure nothrow @nogc + { + return _cvt_dtoi_fast(value); + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoi_sent(0.0) == 0); + assert(_cvt_dtoi_sent(-0.0) == 0); + assert(_cvt_dtoi_sent(float.nan) == -2147483648); + assert(_cvt_dtoi_sent(-float.nan) == -2147483648); + assert(_cvt_dtoi_sent(float.infinity) == -2147483648); + assert(_cvt_dtoi_sent(-float.infinity) == -2147483648); + assert(_cvt_dtoi_sent(1.0) == 1); + assert(_cvt_dtoi_sent(-1.0) == -1); + assert(_cvt_dtoi_sent(2.5) == 
2); + assert(_cvt_dtoi_sent(-2.5) == -2); + assert(_cvt_dtoi_sent(3.5) == 3); + assert(_cvt_dtoi_sent(-3.5) == -3); + assert(_cvt_dtoi_sent(3.49) == 3); + assert(_cvt_dtoi_sent(-3.49) == -3); + assert(_cvt_dtoi_sent(twoExp31Float) == -2147483648); + assert(_cvt_dtoi_sent(-twoExp31Float) == -2147483648); + assert(_cvt_dtoi_sent(twoExp63Float) == -2147483648); + assert(_cvt_dtoi_sent(-twoExp63Float) == -2147483648); + assert(_cvt_dtoi_sent(justUnderTwoExp63Float) == -2147483648); + assert(_cvt_dtoi_sent(33554432.0) == 33554432); + assert(_cvt_dtoi_sent(-33554432.0) == -33554432); + assert(_cvt_dtoi_sent(33554436.0) == 33554436); + assert(_cvt_dtoi_sent(-33554436.0) == -33554436); + assert(_cvt_dtoi_sent(70369281048576.0) == -2147483648); + assert(_cvt_dtoi_sent(-70369281048576.0) == -2147483648); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + uint _cvt_dtoui_sent(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + const integer = cast(ulong) _cvt_dtoll_fast(value); + + return integer > uint.max ? uint.max : cast(uint) integer; + } + else version (X86) + { + if ( + *(cast(const(ulong)*) &value) + <= 0b1_01111111111_0000000000000000000000000000000000000000000000000000 + ) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double) + { + return cast(uint) _cvt_dtoi_fast(value); + } + else if (value < twoExp32Double) + { + /* At this point, the exponent is 31. */ + + /* We have 52-bits stored for the significand, and we know that the exponent is 31, + which means that we can just shift left unconditionally by 21 (52 - 31), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. */ + return cast(uint) (*(cast(const(ulong)*) &value) >>> 21) | (1 << 31); + } + } + + return uint.max; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoui_sent(0.0) == 0); + assert(_cvt_dtoui_sent(-0.0) == 0); + assert(_cvt_dtoui_sent(float.nan) == 4294967295); + assert(_cvt_dtoui_sent(-float.nan) == 4294967295); + assert(_cvt_dtoui_sent(float.infinity) == 4294967295); + assert(_cvt_dtoui_sent(-float.infinity) == 4294967295); + assert(_cvt_dtoui_sent(1.0) == 1); + assert(_cvt_dtoui_sent(-1.0) == 4294967295); + assert(_cvt_dtoui_sent(2.5) == 2); + assert(_cvt_dtoui_sent(-2.5) == 4294967295); + assert(_cvt_dtoui_sent(3.5) == 3); + assert(_cvt_dtoui_sent(-3.5) == 4294967295); + assert(_cvt_dtoui_sent(3.49) == 3); + assert(_cvt_dtoui_sent(-3.49) == 4294967295); + assert(_cvt_dtoui_sent(twoExp31Float) == 2147483648); + assert(_cvt_dtoui_sent(-twoExp31Float) == 4294967295); + assert(_cvt_dtoui_sent(twoExp63Float) == 4294967295); + assert(_cvt_dtoui_sent(-twoExp63Float) == 4294967295); + assert(_cvt_dtoui_sent(justUnderTwoExp63Float) == 4294967295); + assert(_cvt_dtoui_sent(33554432.0) == 33554432); + assert(_cvt_dtoui_sent(-33554432.0) == 4294967295); + assert(_cvt_dtoui_sent(33554436.0) == 33554436); + assert(_cvt_dtoui_sent(-33554436.0) == 4294967295); + assert(_cvt_dtoui_sent(70369281048576.0) == 4294967295); + assert(_cvt_dtoui_sent(-70369281048576.0) == 4294967295); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + extern(C) + pragma(inline, true) + long _cvt_dtoll_sent(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + return _cvt_dtoll_fast(value); + } + else version (X86) + { + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double && value >= -twoExp31Double) + { + return _cvt_dtoi_fast(value); + } + + if (!(value < twoExp63Double && value > -twoExp63Double)) + { + return 0x80000000_00000000; + } + + /* At this point, the exponent is at-least 31 and less-than 63. */ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + long sign = (cast(int) high) >> 31; + assert(sign == 0 || sign == -1); + + int exponent = ((high >>> 20) & 2047) - 1023; + assert(exponent >= 31); + assert(exponent <= 62); + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 63); + } + else + { + significand <<= (shiftCount & 63); + } + + /* If the sign bit is set, we need to negate the significand; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + ulong adjustedSignificand = (significand ^ sign) - sign; + assert(sign == 0 ? adjustedSignificand == significand : adjustedSignificand == -significand); + + return adjustedSignificand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoll_sent(0.0) == 0); + assert(_cvt_dtoll_sent(-0.0) == 0); + assert(_cvt_dtoll_sent(float.nan) == -9223372036854775808); + assert(_cvt_dtoll_sent(-float.nan) == -9223372036854775808); + assert(_cvt_dtoll_sent(float.infinity) == -9223372036854775808); + assert(_cvt_dtoll_sent(-float.infinity) == -9223372036854775808); + assert(_cvt_dtoll_sent(1.0) == 1); + assert(_cvt_dtoll_sent(-1.0) == -1); + assert(_cvt_dtoll_sent(2.5) == 2); + assert(_cvt_dtoll_sent(-2.5) == -2); + assert(_cvt_dtoll_sent(3.5) == 3); + assert(_cvt_dtoll_sent(-3.5) == -3); + assert(_cvt_dtoll_sent(3.49) == 3); + assert(_cvt_dtoll_sent(-3.49) == -3); + assert(_cvt_dtoll_sent(twoExp31Float) == 2147483648); + assert(_cvt_dtoll_sent(-twoExp31Float) == -2147483648); + assert(_cvt_dtoll_sent(twoExp63Float) == -9223372036854775808); + assert(_cvt_dtoll_sent(-twoExp63Float) == -9223372036854775808); + assert(_cvt_dtoll_sent(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_dtoll_sent(33554432.0) == 33554432); + assert(_cvt_dtoll_sent(-33554432.0) == -33554432); + assert(_cvt_dtoll_sent(33554436.0) == 33554436); + assert(_cvt_dtoll_sent(-33554436.0) == -33554436); + assert(_cvt_dtoll_sent(70369281048576.0) == 70369281048576); + assert(_cvt_dtoll_sent(-70369281048576.0) == -70369281048576); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + extern(C) + pragma(inline, true) + ulong _cvt_dtoull_sent(double value) @trusted pure nothrow @nogc + { + version (X86_64) + { + if (value < -1.0 || value != value) + { + return ulong.max; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp63Double) + { + return cast(ulong) _cvt_dtoll_fast(value); + } + + if (value >= twoExp64Double) + { + return ulong.max; + } + + /* At this point, the exponent is 63. 
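+               (For instance 2^63, whose raw bits are 0x43E0000000000000: the shift left by 11
+               below discards the sign and exponent fields, and ORing in the implicit bit at
+               position 63 yields 0x8000000000000000, i.e. 9223372036854775808, as the
+               tests expect.)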
*/ + + /* We have 52-bits stored for the significand, and we know that the exponent is 63, + which means that we can just shift left unconditionally by 11 (63 - 52), which leaves + the implicit bit of the full 53-bit significand to be set at the most-significant bit. */ + return (*(cast(const(ulong)*) &value) << 11) | (ulong(1) << 63); + } + else version (X86) + { + if (value < -1.0 || value != value) + { + return ulong.max; + } + + /* If the hardware can handle it, let it handle it. */ + if (value < twoExp31Double) + { + return cast(ulong) _cvt_dtoi_fast(value); + } + + if (value >= twoExp64Double) + { + return ulong.max; + } + + /* At this point, the exponent is at-least 31 and less-than 64. */ + + long asInt = *(cast(const(long)*) &value); + + uint high = cast(uint) (asInt >>> 32); + uint low = cast(uint) asInt; + + int exponent = ((high >>> 20) & 2047) - 1023; + assert(exponent >= 31); + assert(exponent <= 63); + + ulong significand = (ulong((high & 0b00000000_00001111_11111111_11111111) | (1 << 20)) << 32) | low; + uint shiftCount = (exponent < 52 ? 52 : exponent) - (exponent < 52 ? exponent : 52); + + if (exponent < 52) + { + significand >>>= (shiftCount & 63); + } + else + { + significand <<= (shiftCount & 63); + } + + return significand; + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_cvt_dtoull_sent(0.0) == 0); + assert(_cvt_dtoull_sent(-0.0) == 0); + assert(_cvt_dtoull_sent(float.nan) == 18446744073709551615); + assert(_cvt_dtoull_sent(-float.nan) == 18446744073709551615); + assert(_cvt_dtoull_sent(float.infinity) == 18446744073709551615); + assert(_cvt_dtoull_sent(-float.infinity) == 18446744073709551615); + assert(_cvt_dtoull_sent(1.0) == 1); + assert(_cvt_dtoull_sent(-1.0) == 18446744073709551615); + assert(_cvt_dtoull_sent(2.5) == 2); + assert(_cvt_dtoull_sent(-2.5) == 18446744073709551615); + assert(_cvt_dtoull_sent(3.5) == 3); + assert(_cvt_dtoull_sent(-3.5) == 18446744073709551615); + assert(_cvt_dtoull_sent(3.49) == 3); + assert(_cvt_dtoull_sent(-3.49) == 18446744073709551615); + assert(_cvt_dtoull_sent(twoExp31Float) == 2147483648); + assert(_cvt_dtoull_sent(-twoExp31Float) == 18446744073709551615); + assert(_cvt_dtoull_sent(twoExp63Float) == 9223372036854775808); + assert(_cvt_dtoull_sent(-twoExp63Float) == 18446744073709551615); + assert(_cvt_dtoull_sent(justUnderTwoExp63Float) == 9223371487098961920); + assert(_cvt_dtoull_sent(33554432.0) == 33554432); + assert(_cvt_dtoull_sent(-33554432.0) == 18446744073709551615); + assert(_cvt_dtoull_sent(33554436.0) == 33554436); + assert(_cvt_dtoull_sent(-33554436.0) == 18446744073709551615); + assert(_cvt_dtoull_sent(70369281048576.0) == 70369281048576); + assert(_cvt_dtoull_sent(-70369281048576.0) == 18446744073709551615); + + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (X86_64_Or_X86) + { + void __halt() @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + asm @trusted nothrow @nogc + { + "hlt"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted nothrow @nogc + { + hlt; + } + } + else + { + static assert(false); + } + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte __readgsbyte(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ubyte)(Offset); + } + + extern(C) + pragma(inline, true) + ushort __readgsword(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ushort)(Offset); + } + + extern(C) + pragma(inline, true) + uint __readgsdword(uint Offset) 
nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(uint)(Offset); + } + + extern(C) + pragma(inline, true) + ulong __readgsqword(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ulong)(Offset); + } + + extern(C) + pragma(inline, true) + void __writegsbyte(uint Offset, ubyte Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ubyte)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writegsword(uint Offset, ushort Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ushort)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writegsdword(uint Offset, uint Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, uint)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writegsqword(uint Offset, ulong Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ulong)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addgsbyte(uint Offset, ubyte Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", ubyte)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addgsword(uint Offset, ushort Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", ushort)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addgsdword(uint Offset, uint Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", uint)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addgsqword(uint Offset, ulong Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", ulong)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __incgsbyte(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(ubyte, "++")(Offset); + } + + extern(C) + pragma(inline, true) + void __incgsword(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(ushort, "++")(Offset); + } + + extern(C) + pragma(inline, true) + void __incgsdword(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(uint, "++")(Offset); + } + + extern(C) + pragma(inline, true) + void __incgsqword(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(ulong, "++")(Offset); + } + } + + version (X86) + { + extern(C) + pragma(inline, true) + ubyte __readfsbyte(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ubyte)(Offset); + } + + extern(C) + pragma(inline, true) + ushort __readfsword(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ushort)(Offset); + } + + extern(C) + pragma(inline, true) + uint __readfsdword(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(uint)(Offset); + } + + extern(C) + pragma(inline, true) + ulong __readfsqword(uint Offset) nothrow @nogc + { + return manipulateMemoryThroughTIBSegmentRegister!(ulong)(Offset); + } + + extern(C) + pragma(inline, true) + void __writefsbyte(uint Offset, ubyte Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ubyte)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writefsword(uint Offset, ushort Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ushort)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writefsdword(uint Offset, uint Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, 
uint)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __writefsqword(uint Offset, ulong Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, null, ulong)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addfsbyte(uint Offset, ubyte Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", ubyte)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addfsword(uint Offset, ushort Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", ushort)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __addfsdword(uint Offset, uint Data) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(void, "+", uint)(Offset, Data); + } + + extern(C) + pragma(inline, true) + void __incfsbyte(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(ubyte, "++")(Offset); + } + + extern(C) + pragma(inline, true) + void __incfsword(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(ushort, "++")(Offset); + } + + extern(C) + pragma(inline, true) + void __incfsdword(uint Offset) nothrow @nogc + { + manipulateMemoryThroughTIBSegmentRegister!(uint, "++")(Offset); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + private + mixin(Args.length == 0 && operator == null ? "Integer" : "void") + manipulateMemoryThroughTIBSegmentRegister( + Integer = void, + string operator = null, + Args... + )( + uint offset, + Args args + ) nothrow @nogc + if ( + Args.length == 1 + ? (is(Integer == void) && __traits(isIntegral, Args[0]) && (operator == null || operator == "+")) + : (Args.length == 0 && __traits(isIntegral, Integer) && (operator == null || operator == "++")) + ) + { + enum bool reading = Args.length == 0 && operator == null; + static if (Args.length == 0) alias Int = Integer; else alias Int = Args[0]; + + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + version (X86) enum addressSpace = "addrspace(257)"; + else version (X86_64) enum addressSpace = "addrspace(256)"; + + enum size = Int.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + enum ptr = llvmIRPtr!(type, addressSpace); + + enum loadValue = "%address = inttoptr i32 %0 to " ~ ptr ~ "; + %data = load " ~ type ~ ", " ~ ptr ~ " %address;\n"; + + static if (reading) + { + return __ir!( + loadValue ~ "ret " ~ type ~ " %data;", + Int + )(offset); + } + else + { + static if (operator == null) + { + enum code = "%address = inttoptr i32 %0 to " ~ ptr ~ "; + store " ~ type ~ " %1, " ~ ptr ~ " %address;"; + } + else static if (operator == "++") + { + enum code = loadValue + ~ "%changed = add " ~ type ~ " %data, 1; + store " ~ type ~ " %changed, " ~ ptr ~ " %address;"; + } + else static if (operator == "+") + { + enum code = loadValue + ~ "%changed = add " ~ type ~ " %data, %1; + store " ~ type ~ " %changed, " ~ ptr ~ " %address;"; + } + + __ir!(code, void)(offset, args); + } + } + else version (GNU) + { + version (X86) + { + enum segment = "fs"; + enum canMoveEightBytes = false; + } + else version (X86_64) + { + enum segment = "gs"; + enum canMoveEightBytes = true; + } + + static if (reading) + { + static if (Int.sizeof <= 4 || canMoveEightBytes) + { + Int result; + + asm nothrow @nogc + { + "mov " ~ segment ~ ":(%1), %0 " : "=r" (result) : "ri" (offset) : "memory"; + } + + return result; + } + else + { + uint lo; + uint hi; + + asm nothrow @nogc + { + "mov " ~ segment ~ ":(%2), %0 + mov " ~ segment ~ ":4(%2), %1" + : "=&r" (lo), "=r" (hi) + 
: "ri" (offset) + : "memory"; + } + + return lo | (Int(hi) << 32); + } + } + else + { + static if (operator == null) + { + static if (Int.sizeof <= 4 || canMoveEightBytes) + { + asm nothrow @nogc + { + "mov %1, " ~ segment ~ ":(%0)" : : "ri" (offset), "r" (args[0]) : "memory"; + } + } + else + { + asm nothrow @nogc + { + "mov %1, " ~ segment ~ ":(%0) + mov %2, " ~ segment ~ ":4(%0)" + : + : "ri" (offset), "r" (cast(uint) args[0]), "r" (cast(uint) (args[0] >>> 32)) + : "memory"; + } + } + } + else static if (operator == "++") + { + import core.bitop : bsr; + + enum char suffix = "bwlq"[Int.sizeof.bsr]; + + asm nothrow @nogc + { + "inc" ~ suffix ~ " " ~ segment ~ ":(%0)" : : "ri" (offset) : "memory", "cc"; + } + } + else static if (operator == "+") + { + asm nothrow @nogc + { + "add %1, " ~ segment ~ ":(%0)" : : "ri" (offset), "r" (args[0]) : "memory", "cc"; + } + } + } + } + else version (InlineAsm_X86_64_Or_X86) + { + import core.bitop : bsr; + + enum size = Int.sizeof.bsr; + + version (D_InlineAsm_X86_64) + { + static if (reading) + { + mixin( + /* ECX is offset. */ + "asm nothrow @nogc + { + naked; + mov " ~ ["AL", "AX", "EAX", "RAX"][size] ~ ", GS:[ECX]; + ret; + }" + ); + } + else + { + static if (operator == "++") + { + mixin( + /* ECX is offset. */ + "asm nothrow @nogc + { + naked; + inc " ~ ["ubyte", "word", "dword", "qword"][size] ~ " ptr GS:[ECX]; + ret; + }" + ); + } + else static if (operator == "+" || operator == null) + { + enum op = operator == "+" ? "add" : "mov"; + + mixin( + /* ECX is offset; EDX is args[0]. */ + "asm nothrow @nogc + { + naked; + " ~ op ~ " GS:[ECX], " ~ ["DL", "DX", "EDX", "RDX"][size] ~ "; + ret; + }" + ); + } + } + } + else version (D_InlineAsm_X86) + { + static if (reading) + { + static if (size == 3) + { + asm nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* offset. */ + mov EAX, FS:[ECX]; + mov EDX, FS:[ECX + 4]; + ret; + } + } + else static if (size <= 2) + { + mixin( + "asm nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* offset. */ + mov " ~ ["AL", "AX", "EAX"][size] ~ ", FS:[ECX]; + ret; + }" + ); + } + } + else + { + static if (size == 3) + { + asm nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* offset. */ + mov EAX, [ESP + 8]; /* Low half of args[0]. */ + mov EDX, [ESP + 12]; /* High half of args[0]. */ + mov FS:[ECX], EAX; + mov FS:[ECX + 4], EDX; + ret; + } + } + else static if (size <= 2) + { + static if (operator == "++") + { + mixin( + "asm nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* offset. */ + inc " ~ ["ubyte", "word", "dword"][size] ~ " ptr FS:[ECX]; + ret; + }" + ); + } + else static if (operator == "+" || operator == null) + { + enum op = operator == "+" ? "add" : "mov"; + enum source = ["DL", "DX", "EDX"][size]; + + mixin( + "asm nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* offset. */ + mov " ~ source ~ ", [ESP + 8]; /* args[0] */ + " ~ op ~ " FS:[ECX], " ~ source ~ "; + ret; + }" + ); + } + } + } + } + } + else + { + static assert(false); + } + } + + version (Windows) + { + @trusted nothrow @nogc unittest + { + import core.sys.windows.winbase : GetLastError, SetLastError; + + /* The Win32 last-error is stored in the TIB, at an offset of 13-pointers. + Immediately after it is is the number of critical-sections. + We can use GetLastError and SetLastError as a known good implementation of + reading and writing to FS/GS-segmented memory, and so long as we restore + the critical-section count to its original value afterwards, we can use it + to test the reading and writing of 8-byte values. 
*/ + + enum lastErrorOffset = size_t.sizeof * 13; + enum criticalSectionCountOffset = lastErrorOffset + 4; + + version (X86_64) enum prefix = 'g'; else version (X86) enum prefix = 'f'; + + alias addByte = mixin("__add", prefix, "sbyte"); + alias addDword = mixin("__add", prefix, "sdword"); + alias addWord = mixin("__add", prefix, "sword"); + alias incByte = mixin("__inc", prefix, "sbyte"); + alias incDword = mixin("__inc", prefix, "sdword"); + alias incWord = mixin("__inc", prefix, "sword"); + alias readByte = mixin("__read", prefix, "sbyte"); + alias readDword = mixin("__read", prefix, "sdword"); + alias readWord = mixin("__read", prefix, "sword"); + alias readQword = mixin("__read", prefix, "sqword"); + alias writeByte = mixin("__write", prefix, "sbyte"); + alias writeDword = mixin("__write", prefix, "sdword"); + alias writeWord = mixin("__write", prefix, "sword"); + alias writeQword = mixin("__write", prefix, "sqword"); + + SetLastError(0x01234567); + assert(GetLastError() == 0x01234567); + assert(readDword(lastErrorOffset) == 0x01234567); + assert(readWord(lastErrorOffset) == 0x4567); + assert(readByte(lastErrorOffset) == 0x67); + + writeDword(lastErrorOffset, 0x89ABCDEF); + assert(GetLastError() == 0x89ABCDEF); + writeWord(lastErrorOffset, 0x0123); + assert(GetLastError() == 0x89AB0123); + writeByte(lastErrorOffset, 0x45); + assert(GetLastError() == 0x89AB0145); + + auto originalCriticalSectionCount = readDword(criticalSectionCountOffset); + + writeDword(criticalSectionCountOffset, 0xCAFEBEEF); + assert(readQword(lastErrorOffset) == 0xCAFEBEEF_89AB0145); + + writeQword(lastErrorOffset, 0x01234567_89ABCDEF); + assert(readDword(lastErrorOffset) == 0x89ABCDEF); + assert(readDword(criticalSectionCountOffset) == 0x01234567); + + incDword(lastErrorOffset); + assert(readDword(lastErrorOffset) == 0x89ABCDF0); + + incWord(lastErrorOffset + 2); + assert(readDword(lastErrorOffset) == 0x89ACCDF0); + + incByte(lastErrorOffset + 3); + assert(readDword(lastErrorOffset) == 0x8AACCDF0); + + addDword(lastErrorOffset, uint(12)); + assert(readDword(lastErrorOffset) == 0x8AACCDFC); + + addWord(lastErrorOffset + 2, ushort(3)); + assert(readDword(lastErrorOffset) == 0x8AAFCDFC); + + addByte(lastErrorOffset + 3, 4); + assert(readDword(lastErrorOffset) == 0x8EAFCDFC); + + version (X86_64) + { + assert(__readgsqword(lastErrorOffset) == 0x01234567_8EAFCDFC); + + __incgsqword(lastErrorOffset); + assert(__readgsqword(lastErrorOffset) == 0x01234567_8EAFCDFD); + + __addgsqword(lastErrorOffset, ulong(2)); + assert(__readgsqword(lastErrorOffset) == 0x01234567_8EAFCDFF); + } + + writeDword(criticalSectionCountOffset, originalCriticalSectionCount); + } + } + } + + extern(C) + pragma(inline, true) + void __debugbreak() @safe pure nothrow @nogc + { + version (LDC) + { + import ldc.intrinsics : llvm_debugtrap; + llvm_debugtrap(); + } + else version (GNU) + { + version (X86_64_Or_X86) enum code = "int $3"; + else version (ARM) enum code = "udf #0xFE"; + else version (AArch64) enum code = "brk #0xF000"; + + asm @trusted pure nothrow @nogc + { + "" ~ code : : : "cc"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + int 3; + ret; + } + } + else + { + static assert(false); + } + } + + version (none) + { + @safe pure nothrow @nogc unittest + { + /* Run the program in a debugger and it should break here. 
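+           On x86 that means executing the single-byte breakpoint instruction
+           (`int 3`, opcode 0xCC); the ARM `udf #0xFE` and AArch64 `brk #0xF000`
+           encodings used above raise the equivalent debug exception. With no
+           debugger attached, the process instead dies with STATUS_BREAKPOINT
+           (0x80000003).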
*/
+        __debugbreak();
+    }
+}
+
+extern(C)
+pragma(inline, true)
+noreturn __fastfail(uint code) @safe pure nothrow @nogc
+{
+    if (__ctfe)
+    {
+        version (D_BetterC)
+        {
+            assert(false, "__fastfail(code)");
+        }
+        else
+        {
+            import core.internal.string : unsignedToTempString;
+            assert(false, "__fastfail(" ~ unsignedToTempString(code) ~ ")");
+        }
+    }
+    else
+    {
+        version (LDC_Or_GNU)
+        {
+            version (X86_64_Or_X86)
+            {
+                asm @trusted pure nothrow @nogc
+                {
+                    "int $41" : : "c" (code);
+                }
+            }
+            else version (ARM)
+            {
+                asm @trusted pure nothrow @nogc
+                {
+                    "mov r0, %0
+                     udf #0xFB"
+                    :
+                    : "ir" (code);
+                }
+            }
+            else version (AArch64)
+            {
+                asm @trusted pure nothrow @nogc
+                {
+                    "mov x0, %0
+                     brk #0xF003"
+                    :
+                    : "ir" (code);
+                }
+            }
+        }
+        else version (D_InlineAsm_X86_64)
+        {
+            asm @trusted pure nothrow @nogc
+            {
+                /* ECX is code. */
+                naked;
+                int 41;
+                ret;
+            }
+        }
+        else version (D_InlineAsm_X86)
+        {
+            asm @trusted pure nothrow @nogc
+            {
+                naked;
+                mov ECX, [ESP + 4]; /* code. */
+                int 41;
+                ret;
+            }
+        }
+        else
+        {
+            static assert(false);
+        }
+
+        version (LDC)
+        {
+            import ldc.llvmasm : __ir_pure;
+            __ir_pure!("unreachable", noreturn)();
+        }
+        else version (GNU)
+        {
+            import gcc.builtins : __builtin_unreachable;
+            __builtin_unreachable();
+            assert(false);
+        }
+        else
+        {
+            assert(false);
+        }
+    }
+}
+
+version (none)
+{
+    @safe pure nothrow @nogc unittest
+    {
+        /* Run the program and it should crash here. Afterwards, in Windows PowerShell,
+           run `Get-EventLog -LogName Application -EntryType Error -Newest 1 | Format-List`,
+           and assuming no other errors have happened since, this program's crash should be returned
+           and the "Exception code" in the `Message` field should be 0xc0000409. */
+        __fastfail(7);
+    }
+}
+
+@safe pure nothrow @nogc
+{
+    static assert(__traits(compiles, __fastfail(7)));
+
+    static assert(
+        !__traits(
+            compiles,
+            ()
+            {
+                enum bool fastFailDuringCTFE = ()
+                {
+                    __fastfail(7);
+                    return true;
+                }();
+            }
+        )
+    );
+}
+
+version (X86_64)
+{
+    extern(C)
+    pragma(inline, true)
+    void __faststorefence() @safe pure nothrow @nogc
+    {
+        if (__ctfe)
+        {
+            /* Just do nothing.
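+               CTFE is single-threaded, so a store fence can have no observable
+               effect there. At run time, the `lock or` of zero into [RSP] below is
+               what provides the ordering: any locked read-modify-write drains the
+               store buffer and acts as a full barrier, without requiring SSE2's
+               sfence/mfence.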
*/ + } + else + { + version (LDC_Or_GNU) + { + asm @trusted pure nothrow @nogc + { + "lock orl $0, (%%rsp)" : : : "cc"; + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + naked; + lock; or dword ptr [RSP], 0; + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + __faststorefence(); + return true; + } + + assert(test()); + static assert(test()); + } + } + + extern(C) + pragma(inline, true) + void _disable() @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + version (X86_64_Or_X86) enum code = "cli"; + else version (ARM) enum code = "cpsid i"; + else version (AArch64) enum code = "msr daifset, #2"; + + asm @trusted pure nothrow @nogc + { + "" ~ code : : : "cc"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + { + cli; + } + } + } + + extern(C) + pragma(inline, true) + void _enable() @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + version (X86_64_Or_X86) enum code = "sti"; + else version (ARM) enum code = "cpsie i"; + else version (AArch64) enum code = "msr daifclr, #2"; + + asm @trusted pure nothrow @nogc + { + "" ~ code : : : "cc"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + { + sti; + } + } + } + + extern(C) + pragma(inline, true) + int _interlockedadd(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _interlockedadd64(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + import core.internal.atomic : atomicFetchAdd; + + static if (__traits(compiles, atomicFetchAdd(Addend, Value))) + { + if (__ctfe) + { + return *((a) @trusted => cast(long*) Addend)(Addend) += Value; + } + else + { + return atomicFetchAdd(Addend, Value) + Value; + } + } + else + { + return interlockedOp!("rmw_add", "add_8", "+", MemoryOrder.seq, true)(Addend, Value) + Value; + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedAdd(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedAdd_acq(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedAdd_rel(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedAdd_nf(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedAdd64(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedAdd64_acq(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedAdd64_rel(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedAdd64_nf(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(Addend, Value); + } + } + + /* This is trusted so that 
it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + shared int intValue = 0x2ACD0123; + shared long longValue = 0x12345678_2ACD0123; + + assert(_interlockedadd(&intValue, 0x10000000) == 0x3ACD0123); + assert(intValue == 0x3ACD0123); + + assert(_interlockedadd64(&longValue, 0x10000000_00000001) == 0x22345678_2ACD0124); + assert(longValue == 0x22345678_2ACD0124); + + version (AArch64_Or_ARM) + { + assert(_InterlockedAdd(&intValue, 0x10000000) == 0x4ACD0123); + assert(intValue == 0x4ACD0123); + assert(_InterlockedAdd_acq(&intValue, 0x10000000) == 0x5ACD0123); + assert(intValue == 0x5ACD0123); + assert(_InterlockedAdd_rel(&intValue, 0x10000000) == 0x6ACD0123); + assert(intValue == 0x6ACD0123); + assert(_InterlockedAdd_nf(&intValue, 0x10000000) == 0x7ACD0123); + assert(intValue == 0x7ACD0123); + + assert(_InterlockedAdd64(&longValue, 0x10000000_00000001) == 0x32345678_2ACD0125); + assert(longValue == 0x32345678_2ACD0125); + assert(_InterlockedAdd64_acq(&longValue, 0x10000000_00000001) == 0x42345678_2ACD0126); + assert(longValue == 0x42345678_2ACD0126); + assert(_InterlockedAdd64_rel(&longValue, 0x10000000_00000001) == 0x52345678_2ACD0127); + assert(longValue == 0x52345678_2ACD0127); + assert(_InterlockedAdd64_nf(&longValue, 0x10000000_00000001) == 0x62345678_2ACD0128); + assert(longValue == 0x62345678_2ACD0128); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + version (X86) + { + extern(C) + pragma(inline, true) + int _InterlockedAddLargeStatistic(scope shared(long)* Addend, int Value) @safe pure nothrow @nogc + { + if (__ctfe) + { + *((a) @trusted => cast(long*) a)(Addend) += Value; + return Value; + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + scope highHalf = ((a) @trusted => &(cast(shared(uint)*) Addend)[1])(Addend); + + enum ptr = llvmIRPtr!"i32" ~ " elementtype(i32)"; + + __ir_pure!( + `call void asm sideeffect inteldialect + "lock add dword ptr $0, $2 + jnc pastAddingOfCarry_${:uid} + lock adc dword ptr $1, 0 + pastAddingOfCarry_${:uid}:", + "=*m,=*m,ir,~{memory},~{flags}" + (` ~ ptr ~ ` %0, ` ~ ptr ~ ` %1, i32 %2)`, + void + )(Addend, highHalf, Value); + + return Value; + } + else version (GNU) + { + scope highHalf = ((a) @trusted => &(cast(shared(uint)*) Addend)[1])(Addend); + + asm @trusted pure nothrow @nogc + { + "lock addl %2, %0 + jnc pastAddingOfCarry_%= + lock adcl $0, %1 + pastAddingOfCarry_%=:" + : "+m" (*cast(shared(uint)*) Addend), "+m" (*highHalf) + : "ir" (Value) + : "memory", "cc"; + } + + return Value; + } + else version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + mov EDX, [ESP + 4]; /* Addend. */ + mov EAX, [ESP + 8]; /* Value. */ + lock; add [EDX], EAX; + jnc pastAddingOfCarry; /* If there's no carry we needn't add it. 
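+                                       Note that, exactly as with MSVC's intrinsic, only
+                                       each 32-bit half is updated atomically; a concurrent
+                                       reader can observe the low half already incremented
+                                       before the carry reaches the high half. That is
+                                       acceptable for the large statistic counters this
+                                       intrinsic is meant for.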
*/ + lock; adc [EDX + 4], 0; + pastAddingOfCarry: + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + shared long value = 0x12345678_2ACD0123; + + assert(_InterlockedAddLargeStatistic(&value, 0x10000001) == 0x10000001); + assert(value == 0x12345678_3ACD0124); + + assert(_InterlockedAddLargeStatistic(&value, 0x62997F6F) == 0x62997F6F); + assert(value == 0x12345678_9D668093); + + assert(_InterlockedAddLargeStatistic(&value, 0x62997F6F) == 0x62997F6F); + assert(value == 0x12345679_00000002); + + return true; + } + + assert(test()); + static assert(test()); + } + } + + extern(C) + pragma(inline, true) + int _InterlockedAnd(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_4", "&")(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedAnd8(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_1", "&")(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedAnd16(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_2", "&")(value, mask); + } + + extern(C) + pragma(inline, true) + long _interlockedand64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&")(value, mask); + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedAnd_acq(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_4", "&", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedAnd_rel(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_4", "&", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedAnd_nf(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_4", "&", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedAnd8_acq(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_1", "&", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedAnd8_rel(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_1", "&", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedAnd8_nf(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_1", "&", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedAnd16_acq(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_2", "&", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedAnd16_rel(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_2", "&", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedAnd16_nf(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_2", "&", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_acq(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&", 
MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_rel(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_nf(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&", MemoryOrder.raw)(value, mask); + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedAnd64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&")(value, mask); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + int _InterlockedAnd_np(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_4", "&", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedAnd8_np(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_1", "&", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedAnd16_np(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_2", "&", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_np(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_and", "and_8", "&", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_HLEAcquire(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "&", "and")(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedAnd64_HLERelease(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "&", "and")(value, mask); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedAnd_HLEAcquire(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "&", "and")(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedAnd_HLERelease(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "&", "and")(value, mask); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
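+
+       A usage note for the family tested below: every _InterlockedAnd variant
+       returns the value the destination held before the mask was applied, so
+       (with hypothetical values)
+
+           shared int flags = 0b1100;
+           int old = _InterlockedAnd(&flags, 0b0110);
+           assert(old == 0b1100 && flags == 0b0100);
+
+       holds for the plain, _acq/_rel/_nf, _np, and HLE forms alike.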
*/ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + alias t(alias symbol, T) = interlockedOpTest!("&", symbol, T); + + t!(_InterlockedAnd, int)(); + t!(_InterlockedAnd8, byte)(); + t!(_InterlockedAnd16, short)(); + t!(_interlockedand64, long)(); + + version (AArch64_Or_ARM) + { + t!(_InterlockedAnd_acq, int)(); + t!(_InterlockedAnd_rel, int)(); + t!(_InterlockedAnd_nf, int)(); + t!(_InterlockedAnd8_acq, byte)(); + t!(_InterlockedAnd8_rel, byte)(); + t!(_InterlockedAnd8_nf, byte)(); + t!(_InterlockedAnd16_acq, short)(); + t!(_InterlockedAnd16_rel, short)(); + t!(_InterlockedAnd16_nf, short)(); + t!(_InterlockedAnd64_acq, long)(); + t!(_InterlockedAnd64_rel, long)(); + t!(_InterlockedAnd64_nf, long)(); + } + + version (X86_64_Or_AArch64_Or_ARM) + { + t!(_InterlockedAnd64, long)(); + } + + version (X86_64) + { + t!(_InterlockedAnd_np, int)(); + t!(_InterlockedAnd8_np, byte)(); + t!(_InterlockedAnd16_np, short)(); + t!(_InterlockedAnd64_np, long)(); + t!(_InterlockedAnd64_HLEAcquire, long)(); + t!(_InterlockedAnd64_HLERelease, long)(); + } + + version (X86_64_Or_X86) + { + t!(_InterlockedAnd_HLEAcquire, int)(); + t!(_InterlockedAnd_HLERelease, int)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~")(a, b); + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset64(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~")(a, b); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset_HLEAcquire(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~", MemoryOrder.seq, 1)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset_HLERelease(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~", MemoryOrder.seq, 2)(a, b); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset64_HLEAcquire(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~", MemoryOrder.seq, 1)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset64_HLERelease(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~", MemoryOrder.seq, 2)(a, b); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset_acq(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~", MemoryOrder.acq)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset_rel(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~", MemoryOrder.acq_rel)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset_nf(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_4", "&", "~", MemoryOrder.raw)(a, b); + } + } + + version (AArch64) + { + extern(C) + 
pragma(inline, true) + ubyte _interlockedbittestandreset64_acq(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~", MemoryOrder.acq)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset64_rel(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~", MemoryOrder.acq_rel)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandreset64_nf(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("btr", "rmw_and", "and_8", "&", "~", MemoryOrder.raw)(a, b); + } + } + + @system pure nothrow @nogc unittest + { + enum ulong datumA = 0b0111111110100010110111000101011101001111001100111111101100010100; + enum ulong datumB = 0b0001001000011101110011000010011010101000101000111001000001101110; + enum ulong datumC = 0b1010010101000100010111111111000100001000010010111000100111100110; + enum ulong datumD = 0b1011110000010110101001111110000110000011001100101010111100011101; + + static void bitResetTest(alias btr, T)() + { + scope shared(T)[4] data = [cast(T) datumA, cast(T) datumB, cast(T) datumC, cast(T) datumD]; + + assert(btr(&data[0], T(0)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + + assert(btr(&data[0], T(2)) == 1); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010000); + + assert(btr(&data[0], cast(T) ((T.sizeof << 3) * 3)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + + assert(btr(&data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + } + + static bool test() + { + bitResetTest!(_interlockedbittestandreset, int)(); + + version (X86_64_Or_AArch64) + { + bitResetTest!(_interlockedbittestandreset64, long)(); + } + + version (X86_64_Or_X86) + { + bitResetTest!(_interlockedbittestandreset_HLEAcquire, int)(); + bitResetTest!(_interlockedbittestandreset_HLERelease, int)(); + } + + version (X86_64) + { + bitResetTest!(_interlockedbittestandreset64_HLEAcquire, long)(); + bitResetTest!(_interlockedbittestandreset64_HLERelease, long)(); + } + + version (AArch64_Or_ARM) + { + bitResetTest!(_interlockedbittestandreset_acq, int)(); + bitResetTest!(_interlockedbittestandreset_rel, int)(); + bitResetTest!(_interlockedbittestandreset_nf, int)(); + } + + version (AArch64) + { + bitResetTest!(_interlockedbittestandreset64_acq, long)(); + bitResetTest!(_interlockedbittestandreset64_rel, long)(); + bitResetTest!(_interlockedbittestandreset64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "")(a, b); + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "")(a, b); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset_HLEAcquire(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "", MemoryOrder.seq, 1)(a, 
b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset_HLERelease(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "", MemoryOrder.seq, 2)(a, b); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64_HLEAcquire(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "", MemoryOrder.seq, 1)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64_HLERelease(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "", MemoryOrder.seq, 2)(a, b); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset_acq(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "", MemoryOrder.acq)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset_rel(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "", MemoryOrder.acq_rel)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset_nf(scope shared(int)* a, int b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_4", "|", "", MemoryOrder.raw)(a, b); + } + } + + version (AArch64) + { + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64_acq(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "", MemoryOrder.acq)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64_rel(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "", MemoryOrder.acq_rel)(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _interlockedbittestandset64_nf(scope shared(long)* a, long b) @system pure nothrow @nogc + { + return interlockedBitTestOp!("bts", "rmw_or", "or_8", "|", "", MemoryOrder.raw)(a, b); + } + } + + @system pure nothrow @nogc unittest + { + enum ulong datumA = 0b0111111110100010110111000101011101001111001100111111101100010100; + enum ulong datumB = 0b0001001000011101110011000010011010101000101000111001000001101110; + enum ulong datumC = 0b1010010101000100010111111111000100001000010010111000100111100110; + enum ulong datumD = 0b1011110000010110101001111110000110000011001100101010111100011101; + + static void bitSetTest(alias bts, T)() + { + scope shared(T)[4] data = [cast(T) datumA, cast(T) datumB, cast(T) datumC, cast(T) datumD]; + + assert(bts(&data[0], T(0)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010101); + + assert(bts(&data[0], T(2)) == 1); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010101); + + assert(bts(&data[0], cast(T) ((T.sizeof << 3) * 3)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011101); + + assert(bts(&data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011111); + } + + static bool test() + { + bitSetTest!(_interlockedbittestandset, int)(); + + version (X86_64_Or_AArch64) + { + bitSetTest!(_interlockedbittestandset64, long)(); + } + + version (X86_64_Or_X86) + { + 
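                /* The HLE variants below merely add the XACQUIRE/XRELEASE prefixes of
+                   Intel's TSX hardware lock elision; processors without TSX ignore those
+                   prefixes, so the calls behave (and can be tested here) as ordinary
+                   locked operations. */
+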
bitSetTest!(_interlockedbittestandset_HLEAcquire, int)(); + bitSetTest!(_interlockedbittestandset_HLERelease, int)(); + } + + version (X86_64) + { + bitSetTest!(_interlockedbittestandset64_HLEAcquire, long)(); + bitSetTest!(_interlockedbittestandset64_HLERelease, long)(); + } + + version (AArch64_Or_ARM) + { + bitSetTest!(_interlockedbittestandset_acq, int)(); + bitSetTest!(_interlockedbittestandset_rel, int)(); + bitSetTest!(_interlockedbittestandset_nf, int)(); + } + + version (AArch64) + { + bitSetTest!(_interlockedbittestandset64_acq, long)(); + bitSetTest!(_interlockedbittestandset64_rel, long)(); + bitSetTest!(_interlockedbittestandset64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + byte _InterlockedCompareExchange8(scope shared(byte)* Destination, byte Exchange, byte Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + short _InterlockedCompareExchange16(scope shared(short)* Destination, short Exchange, short Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_HLEAcquire(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCASHLE!true(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_HLERelease(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCASHLE!false(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_HLEAcquire(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCASHLE!true(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_HLERelease(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCASHLE!false(Destination, Exchange, Comparand); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_np(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + short _InterlockedCompareExchange16_np(scope shared(short)* Destination, short Exchange, short Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_np(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS(Destination, Exchange, Comparand); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_acq(scope shared(int)* 
Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_rel(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq_rel, MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + int _InterlockedCompareExchange_nf(scope shared(int)* Destination, int Exchange, int Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + byte _InterlockedCompareExchange8_acq(scope shared(byte)* Destination, byte Exchange, byte Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + byte _InterlockedCompareExchange8_rel(scope shared(byte)* Destination, byte Exchange, byte Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq_rel, MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + byte _InterlockedCompareExchange8_nf(scope shared(byte)* Destination, byte Exchange, byte Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + short _InterlockedCompareExchange16_acq(scope shared(short)* Destination, short Exchange, short Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + short _InterlockedCompareExchange16_rel(scope shared(short)* Destination, short Exchange, short Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq_rel, MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + short _InterlockedCompareExchange16_nf(scope shared(short)* Destination, short Exchange, short Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_acq(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_rel(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq_rel, MemoryOrder.raw)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + long _InterlockedCompareExchange64_nf(scope shared(long)* Destination, long Exchange, long Comparand) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.raw)(Destination, Exchange, Comparand); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
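+
+       For reference, the contract shared by every variant tested below: the
+       destination is compared against Comparand, Exchange is stored only on a
+       match, and the initial value is returned either way, so success is
+       detected by comparing the return value with Comparand (hypothetical
+       values):
+
+           shared int x = 5;
+           bool swapped = _InterlockedCompareExchange(&x, 9, 5) == 5;
+           assert(swapped && x == 9);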
*/ + @trusted pure nothrow @nogc unittest + { + static void compareExchangeTest(alias symbol, T)() + { + shared T value = cast(T) 0x6B2E38BF9FAF53EC; + + assert(symbol(&value, value, value) == cast(T) 0x6B2E38BF9FAF53EC); + assert(value == cast(T) 0x6B2E38BF9FAF53EC); + + assert(symbol(&value, cast(T) 0x24AC9053985CF040, value) == cast(T) 0x6B2E38BF9FAF53EC); + assert(value == cast(T) 0x24AC9053985CF040); + + assert(symbol(&value, cast(T) 0x426A6F348BBD3430, 123) == cast(T) 0x24AC9053985CF040); + assert(value == cast(T) 0x24AC9053985CF040); + } + + static bool test() + { + compareExchangeTest!(_InterlockedCompareExchange, int)(); + compareExchangeTest!(_InterlockedCompareExchange8, byte)(); + compareExchangeTest!(_InterlockedCompareExchange16, short)(); + compareExchangeTest!(_InterlockedCompareExchange64, long)(); + + version (X86_64_Or_X86) + { + compareExchangeTest!(_InterlockedCompareExchange_HLEAcquire, int)(); + compareExchangeTest!(_InterlockedCompareExchange_HLERelease, int)(); + compareExchangeTest!(_InterlockedCompareExchange64_HLEAcquire, long)(); + compareExchangeTest!(_InterlockedCompareExchange64_HLERelease, long)(); + } + + version (X86_64) + { + compareExchangeTest!(_InterlockedCompareExchange_np, int)(); + compareExchangeTest!(_InterlockedCompareExchange16_np, short)(); + compareExchangeTest!(_InterlockedCompareExchange64_np, long)(); + } + + version (AArch64_Or_ARM) + { + compareExchangeTest!(_InterlockedCompareExchange_acq, int)(); + compareExchangeTest!(_InterlockedCompareExchange_rel, int)(); + compareExchangeTest!(_InterlockedCompareExchange_nf, int)(); + compareExchangeTest!(_InterlockedCompareExchange8_acq, byte)(); + compareExchangeTest!(_InterlockedCompareExchange8_rel, byte)(); + compareExchangeTest!(_InterlockedCompareExchange8_nf, byte)(); + compareExchangeTest!(_InterlockedCompareExchange16_acq, short)(); + compareExchangeTest!(_InterlockedCompareExchange16_rel, short)(); + compareExchangeTest!(_InterlockedCompareExchange16_nf, short)(); + compareExchangeTest!(_InterlockedCompareExchange64_acq, long)(); + compareExchangeTest!(_InterlockedCompareExchange64_rel, long)(); + compareExchangeTest!(_InterlockedCompareExchange64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ubyte _InterlockedCompareExchange128( + scope shared(long)* Destination, + long ExchangeHigh, + long ExchangeLow, + scope long* ComparandResult + ) @system pure nothrow @nogc + { + return interlockedCAS128(Destination, ExchangeHigh, ExchangeLow, ComparandResult); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte _InterlockedCompareExchange128_np( + scope shared(long)* Destination, + long ExchangeHigh, + long ExchangeLow, + scope long* ComparandResult + ) @system pure nothrow @nogc + { + return interlockedCAS128(Destination, ExchangeHigh, ExchangeLow, ComparandResult); + } + } + + version (AArch64) + { + extern(C) + pragma(inline, true) + ubyte _InterlockedCompareExchange128_acq( + scope shared(long)* Destination, + long ExchangeHigh, + long ExchangeLow, + scope long* ComparandResult + ) @system pure nothrow @nogc + { + return interlockedCAS128!(MemoryOrder.acq)(Destination, ExchangeHigh, ExchangeLow, ComparandResult); + } + + extern(C) + pragma(inline, true) + ubyte _InterlockedCompareExchange128_rel( + scope shared(long)* Destination, + long ExchangeHigh, + long ExchangeLow, + scope long* ComparandResult + ) @system pure nothrow @nogc + { + return 
interlockedCAS128!(MemoryOrder.acq_rel, MemoryOrder.raw)( + Destination, + ExchangeHigh, + ExchangeLow, + ComparandResult + ); + } + + extern(C) + pragma(inline, true) + ubyte _InterlockedCompareExchange128_nf( + scope shared(long)* Destination, + long ExchangeHigh, + long ExchangeLow, + scope long* ComparandResult + ) @system pure nothrow @nogc + { + return interlockedCAS128!(MemoryOrder.raw)(Destination, ExchangeHigh, ExchangeLow, ComparandResult); + } + } + + @system pure nothrow @nogc unittest + { + version (LittleEndian) + { + enum size_t lo = 0; + enum size_t hi = 1; + } + else version (BigEndian) + { + enum size_t lo = 1; + enum size_t hi = 0; + } + + static void compareExchangeTest(alias symbol)() + { + shared scope long[2] value; + value[lo] = 0x6B2E38BF9FAF53EC; + value[hi] = 0x5E81D5FBA4340FD3; + + scope long[2] expected = value; + + assert(symbol(&value[0], value[hi], value[lo], &expected[0]) == 1); + assert(value[lo] == 0x6B2E38BF9FAF53EC); + assert(value[hi] == 0x5E81D5FBA4340FD3); + assert(expected[lo] == 0x6B2E38BF9FAF53EC); + assert(expected[hi] == 0x5E81D5FBA4340FD3); + + assert(symbol(&value[0], 0x24AC9053985CF040, 0x936644BBF7E7DD76, &expected[0]) == 1); + assert(value[lo] == 0x936644BBF7E7DD76); + assert(value[hi] == 0x24AC9053985CF040); + assert(expected[lo] == 0x6B2E38BF9FAF53EC); + assert(expected[hi] == 0x5E81D5FBA4340FD3); + + assert(symbol(&value[0], 0x6EEFACD4571F6679, 0xB2281F742F268665, &expected[0]) == 0); + assert(value[lo] == 0x936644BBF7E7DD76); + assert(value[hi] == 0x24AC9053985CF040); + assert(expected[lo] == 0x936644BBF7E7DD76); + assert(expected[hi] == 0x24AC9053985CF040); + } + + static bool test() + { + version (X86_64_Or_AArch64) + { + compareExchangeTest!_InterlockedCompareExchange128(); + } + + version (X86_64) + { + compareExchangeTest!_InterlockedCompareExchange128_np(); + } + + version (AArch64) + { + compareExchangeTest!_InterlockedCompareExchange128_acq(); + compareExchangeTest!_InterlockedCompareExchange128_rel(); + compareExchangeTest!_InterlockedCompareExchange128_nf(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.seq, MemoryOrder.seq, void*)(Destination, Exchange, Comparand); + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_HLEAcquire( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCASHLE!(true, void*)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_HLERelease( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCASHLE!(false, void*)(Destination, Exchange, Comparand); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_np( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.seq, MemoryOrder.seq, void*)(Destination, Exchange, Comparand); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_acq( + scope shared(void*)* 
Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq, MemoryOrder.acq, void*)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_rel( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.acq_rel, MemoryOrder.raw, void*)(Destination, Exchange, Comparand); + } + + extern(C) + pragma(inline, true) + void* _InterlockedCompareExchangePointer_nf( + scope shared(void*)* Destination, + scope void* Exchange, + return scope void* Comparand + ) + @safe pure nothrow @nogc + { + return interlockedCAS!(MemoryOrder.raw, MemoryOrder.raw, void*)(Destination, Exchange, Comparand); + } + } + + @safe pure nothrow @nogc unittest + { + static void* p(ulong value) @trusted + { + return cast(void*) cast(size_t) value; + } + + static void compareExchangeTest(alias symbol)() + { + scope void* value = p(0x6B2E38BF9FAF53EC); + scope shared(void*)* valueAddress = ((return scope ref v) @trusted => cast(shared(void*)*) &v)(value); + + assert(symbol(valueAddress, value, value) == p(0x6B2E38BF9FAF53EC)); + assert(value == p(0x6B2E38BF9FAF53EC)); + + assert(symbol(valueAddress, p(0x24AC9053985CF040), value) == p(0x6B2E38BF9FAF53EC)); + assert(value == p(0x24AC9053985CF040)); + + assert(symbol(valueAddress, p(0x426A6F348BBD3430), p(123)) == p(0x24AC9053985CF040)); + assert(value == p(0x24AC9053985CF040)); + } + + static bool test() + { + compareExchangeTest!_InterlockedCompareExchangePointer(); + + version (X86_64_Or_X86) + { + compareExchangeTest!_InterlockedCompareExchangePointer_HLEAcquire(); + compareExchangeTest!_InterlockedCompareExchangePointer_HLERelease(); + } + + version (X86_64) + { + compareExchangeTest!_InterlockedCompareExchangePointer_np(); + } + + version (AArch64_Or_ARM) + { + compareExchangeTest!_InterlockedCompareExchangePointer_acq(); + compareExchangeTest!_InterlockedCompareExchangePointer_rel(); + compareExchangeTest!_InterlockedCompareExchangePointer_nf(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedDecrement(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + short _InterlockedDecrement16(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + long _interlockeddecrement64(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + import core.internal.atomic : atomicFetchAdd; + + static if (__traits(compiles, atomicFetchAdd(lpAddend, -1))) + { + if (__ctfe) + { + return *((a) @trusted => cast(long*) a)(lpAddend) += -1; + } + else + { + return atomicFetchAdd(lpAddend, -1) - 1; + } + } + else + { + return interlockedOp!("rmw_add", "add_8", "+", MemoryOrder.seq, true)(lpAddend, -1) - 1; + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedDecrement64(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, -1); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedDecrement_acq(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + int 
_InterlockedDecrement_rel(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + int _InterlockedDecrement_nf(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + short _InterlockedDecrement16_acq(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + short _InterlockedDecrement16_rel(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + short _InterlockedDecrement16_nf(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + long _InterlockedDecrement64_acq(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + long _InterlockedDecrement64_rel(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, -1); + } + + extern(C) + pragma(inline, true) + long _InterlockedDecrement64_nf(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, -1); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static void decrementTest(alias symbol, T)() + { + shared T value = 1; + + assert(symbol(&value) == 0); + assert(value == 0); + + assert(symbol(&value) == -1); + assert(value == -1); + } + + static bool test() + { + decrementTest!(_InterlockedDecrement, int)(); + decrementTest!(_InterlockedDecrement16, short)(); + decrementTest!(_interlockeddecrement64, long)(); + + version (X86_64_Or_AArch64_Or_ARM) + { + decrementTest!(_InterlockedDecrement64, long)(); + } + + version (AArch64_Or_ARM) + { + decrementTest!(_InterlockedDecrement_acq, int)(); + decrementTest!(_InterlockedDecrement_rel, int)(); + decrementTest!(_InterlockedDecrement_nf, int)(); + decrementTest!(_InterlockedDecrement16_acq, short)(); + decrementTest!(_InterlockedDecrement16_rel, short)(); + decrementTest!(_InterlockedDecrement16_nf, short)(); + decrementTest!(_InterlockedDecrement64_acq, long)(); + decrementTest!(_InterlockedDecrement64_rel, long)(); + decrementTest!(_InterlockedDecrement64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchange(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchange(Target, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchange8(scope shared(byte)* Target, byte Value) @safe pure nothrow @nogc + { + return interlockedExchange(Target, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchange16(scope shared(short)* Target, short Value) @safe pure nothrow @nogc + { + return interlockedExchange(Target, Value); + } + + extern(C) + pragma(inline, true) + long _interlockedexchange64(scope shared(long)* Target, long Value) @trusted pure nothrow @nogc + { + static if (__traits(compiles, interlockedExchange(Target, Value))) + { + return interlockedExchange(Target, Value); + } + else + { + if (__ctfe) + { + long oldValue = *cast(long*) 
Target; + *cast(long*) Target = Value; + return oldValue; + } + else + { + import core.internal.atomic : atomicCompareExchangeWeak, atomicLoad; + + long data = atomicLoad!(MemoryOrder.raw)(Target); + + while (!atomicCompareExchangeWeak(cast(long*) Target, &data, Value)) + {} + + return data; + } + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedExchange64(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchange(Target, Value); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedExchange_HLEAcquire(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeHLE!true(Target, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchange_HLERelease(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeHLE!false(Target, Value); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + long _InterlockedExchange64_HLEAcquire(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeHLE!true(Target, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchange64_HLERelease(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeHLE!false(Target, Value); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedExchange_acq(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq)(Target, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchange_rel(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq_rel)(Target, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchange_nf(scope shared(int)* Target, int Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.raw)(Target, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchange8_acq(scope shared(byte)* Target, byte Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq)(Target, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchange8_rel(scope shared(byte)* Target, byte Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq_rel)(Target, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchange8_nf(scope shared(byte)* Target, byte Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.raw)(Target, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchange16_acq(scope shared(short)* Target, short Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq)(Target, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchange16_rel(scope shared(short)* Target, short Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq_rel)(Target, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchange16_nf(scope shared(short)* Target, short Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.raw)(Target, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchange64_acq(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq)(Target, Value); + } + + extern(C) + pragma(inline, true) + long 
_InterlockedExchange64_rel(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq_rel)(Target, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchange64_nf(scope shared(long)* Target, long Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.raw)(Target, Value); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static void exchangeTest(alias symbol, T)() + { + shared T value = cast(T) 0x0790C852D0938C7B; + + assert(symbol(&value, cast(T) 0x612396D4FDC2C66A) == cast(T) 0x0790C852D0938C7B); + assert(value == cast(T) 0x612396D4FDC2C66A); + + assert(symbol(&value, cast(T) 0xAA6C3899EABBE818) == cast(T) 0x612396D4FDC2C66A); + assert(value == cast(T) 0xAA6C3899EABBE818); + } + + static bool test() + { + exchangeTest!(_InterlockedExchange, int)(); + exchangeTest!(_InterlockedExchange8, byte)(); + exchangeTest!(_InterlockedExchange16, short)(); + exchangeTest!(_interlockedexchange64, long)(); + + version (X86_64_Or_AArch64_Or_ARM) + { + exchangeTest!(_InterlockedExchange64, long)(); + } + + version (X86_64_Or_X86) + { + exchangeTest!(_InterlockedExchange_HLEAcquire, int)(); + exchangeTest!(_InterlockedExchange_HLERelease, int)(); + } + + version (X86_64) + { + exchangeTest!(_InterlockedExchange64_HLEAcquire, long)(); + exchangeTest!(_InterlockedExchange64_HLERelease, long)(); + } + + version (AArch64_Or_ARM) + { + exchangeTest!(_InterlockedExchange_acq, int)(); + exchangeTest!(_InterlockedExchange_rel, int)(); + exchangeTest!(_InterlockedExchange_nf, int)(); + exchangeTest!(_InterlockedExchange8_acq, byte)(); + exchangeTest!(_InterlockedExchange8_rel, byte)(); + exchangeTest!(_InterlockedExchange8_nf, byte)(); + exchangeTest!(_InterlockedExchange16_acq, short)(); + exchangeTest!(_InterlockedExchange16_rel, short)(); + exchangeTest!(_InterlockedExchange16_nf, short)(); + exchangeTest!(_InterlockedExchange64_acq, long)(); + exchangeTest!(_InterlockedExchange64_rel, long)(); + exchangeTest!(_InterlockedExchange64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchangeAdd8(scope shared(byte)* Addend, byte Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchangeAdd16(scope shared(short)* Addend, short Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _interlockedexchangeadd64(scope shared(long)* Addend, long Value) @trusted pure nothrow @nogc + { + static if (__traits(compiles, interlockedExchangeAdd(Addend, Value))) + { + return interlockedExchangeAdd(Addend, Value); + } + else + { + return interlockedOp!("rmw_add", "add_8", "+", MemoryOrder.seq, true)(Addend, Value); + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd(Addend, Value); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd_HLEAcquire(scope shared(int)* Addend, int Value) @safe pure nothrow 
@nogc + { + return interlockedExchangeAddHLE!true(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd_HLERelease(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeAddHLE!false(Addend, Value); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64_HLEAcquire(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAddHLE!true(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64_HLERelease(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAddHLE!false(Addend, Value); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd_acq(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd_rel(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + int _InterlockedExchangeAdd_nf(scope shared(int)* Addend, int Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.raw)(Addend, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchangeAdd8_acq(scope shared(byte)* Addend, byte Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchangeAdd8_rel(scope shared(byte)* Addend, byte Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + byte _InterlockedExchangeAdd8_nf(scope shared(byte)* Addend, byte Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.raw)(Addend, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchangeAdd16_acq(scope shared(short)* Addend, short Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchangeAdd16_rel(scope shared(short)* Addend, short Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + short _InterlockedExchangeAdd16_nf(scope shared(short)* Addend, short Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.raw)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64_acq(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64_rel(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.acq_rel)(Addend, Value); + } + + extern(C) + pragma(inline, true) + long _InterlockedExchangeAdd64_nf(scope shared(long)* Addend, long Value) @safe pure nothrow @nogc + { + return interlockedExchangeAdd!(MemoryOrder.raw)(Addend, Value); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
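+       The unittest also runs during CTFE via the trailing `static assert(test())`,
+       so the `__ctfe` fallback paths of these wrappers are verified at compile
+       time as well.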
*/ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + alias t(alias symbol, T) = interlockedOpTest!("+", symbol, T); + + t!(_InterlockedExchangeAdd, int)(); + t!(_InterlockedExchangeAdd8, byte)(); + t!(_InterlockedExchangeAdd16, short)(); + t!(_interlockedexchangeadd64, long)(); + + version (X86_64_Or_AArch64_Or_ARM) + { + t!(_InterlockedExchangeAdd64, long)(); + } + + version (X86_64_Or_X86) + { + t!(_InterlockedExchangeAdd_HLEAcquire, int)(); + t!(_InterlockedExchangeAdd_HLERelease, int)(); + } + + version (X86_64) + { + t!(_InterlockedExchangeAdd64_HLEAcquire, long)(); + t!(_InterlockedExchangeAdd64_HLERelease, long)(); + } + + version (AArch64_Or_ARM) + { + t!(_InterlockedExchangeAdd_acq, int)(); + t!(_InterlockedExchangeAdd_rel, int)(); + t!(_InterlockedExchangeAdd_nf, int)(); + t!(_InterlockedExchangeAdd8_acq, byte)(); + t!(_InterlockedExchangeAdd8_rel, byte)(); + t!(_InterlockedExchangeAdd8_nf, byte)(); + t!(_InterlockedExchangeAdd16_acq, short)(); + t!(_InterlockedExchangeAdd16_rel, short)(); + t!(_InterlockedExchangeAdd16_nf, short)(); + t!(_InterlockedExchangeAdd64_acq, long)(); + t!(_InterlockedExchangeAdd64_rel, long)(); + t!(_InterlockedExchangeAdd64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer(scope shared(void*)* Target, scope void* Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.seq, void*)(Target, Value); + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer_HLEAcquire(scope shared(void*)* Target, scope void* Value) + @safe pure nothrow @nogc + { + return interlockedExchangeHLE!(true, void*)(Target, Value); + } + + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer_HLERelease(scope shared(void*)* Target, scope void* Value) + @safe pure nothrow @nogc + { + return interlockedExchangeHLE!(false, void*)(Target, Value); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer_acq(scope shared(void*)* Target, scope void* Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq, void*)(Target, Value); + } + + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer_rel(scope shared(void*)* Target, scope void* Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.acq_rel, void*)(Target, Value); + } + + extern(C) + pragma(inline, true) + void* _InterlockedExchangePointer_nf(scope shared(void*)* Target, scope void* Value) @safe pure nothrow @nogc + { + return interlockedExchange!(MemoryOrder.raw, void*)(Target, Value); + } + } + + @safe pure nothrow @nogc unittest + { + static void* p(ulong value) @trusted + { + return cast(void*) cast(size_t) value; + } + + static void exchangeTest(alias symbol)() + { + scope void* value = p(0x0790C852D0938C7B); + scope shared(void*)* valueAddress = ((return scope ref v) @trusted => cast(shared(void*)*) &v)(value); + + assert(symbol(valueAddress, p(0x612396D4FDC2C66A)) == p(0x0790C852D0938C7B)); + assert(value == p(0x612396D4FDC2C66A)); + + assert(symbol(valueAddress, p(0xAA6C3899EABBE818)) == p(0x612396D4FDC2C66A)); + assert(value == p(0xAA6C3899EABBE818)); + } + + static bool test() + { + exchangeTest!_InterlockedExchangePointer(); + + version (X86_64_Or_X86) + { + exchangeTest!_InterlockedExchangePointer_HLEAcquire(); + exchangeTest!_InterlockedExchangePointer_HLERelease(); + } + + version 
(AArch64_Or_ARM) + { + exchangeTest!_InterlockedExchangePointer_acq(); + exchangeTest!_InterlockedExchangePointer_rel(); + exchangeTest!_InterlockedExchangePointer_nf(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedIncrement(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + short _InterlockedIncrement16(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + long _interlockedincrement64(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + import core.internal.atomic : atomicFetchAdd; + + static if (__traits(compiles, atomicFetchAdd(lpAddend, 1))) + { + if (__ctfe) + { + return *((a) @trusted => cast(long*) a)(lpAddend) += 1; + } + else + { + return atomicFetchAdd(lpAddend, 1) + 1; + } + } + else + { + return interlockedOp!("rmw_add", "add_8", "+", MemoryOrder.seq, true)(lpAddend, 1) + 1; + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedIncrement64(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd(lpAddend, 1); + } + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedIncrement_acq(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + int _InterlockedIncrement_rel(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + int _InterlockedIncrement_nf(scope shared(int)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + short _InterlockedIncrement16_acq(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + short _InterlockedIncrement16_rel(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + short _InterlockedIncrement16_nf(scope shared(short)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + long _InterlockedIncrement64_acq(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + long _InterlockedIncrement64_rel(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.acq_rel)(lpAddend, 1); + } + + extern(C) + pragma(inline, true) + long _InterlockedIncrement64_nf(scope shared(long)* lpAddend) @safe pure nothrow @nogc + { + return interlockedAdd!(MemoryOrder.raw)(lpAddend, 1); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + @trusted pure nothrow @nogc unittest + { + static void incrementTest(alias symbol, T)() + { + shared T value = -2; + + assert(symbol(&value) == -1); + assert(value == -1); + + assert(symbol(&value) == 0); + assert(value == 0); + + assert(symbol(&value) == 1); + assert(value == 1); + } + + static bool test() + { + incrementTest!(_InterlockedIncrement, int)(); + incrementTest!(_InterlockedIncrement16, short)(); + incrementTest!(_interlockedincrement64, long)(); + + version (X86_64_Or_AArch64_Or_ARM) + { + incrementTest!(_InterlockedIncrement64, long)(); + } + + version (AArch64_Or_ARM) + { + incrementTest!(_InterlockedIncrement_acq, int)(); + incrementTest!(_InterlockedIncrement_rel, int)(); + incrementTest!(_InterlockedIncrement_nf, int)(); + incrementTest!(_InterlockedIncrement16_acq, short)(); + incrementTest!(_InterlockedIncrement16_rel, short)(); + incrementTest!(_InterlockedIncrement16_nf, short)(); + incrementTest!(_InterlockedIncrement64_acq, long)(); + incrementTest!(_InterlockedIncrement64_rel, long)(); + incrementTest!(_InterlockedIncrement64_nf, long)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedOr(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_4", "|")(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedOr8(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_1", "|")(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedOr16(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_2", "|")(value, mask); + } + + extern(C) + pragma(inline, true) + long _interlockedor64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|")(value, mask); + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedOr_acq(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_4", "|", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedOr_rel(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_4", "|", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedOr_nf(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_4", "|", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedOr8_acq(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_1", "|", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedOr8_rel(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_1", "|", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedOr8_nf(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_1", "|", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedOr16_acq(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_2", "|", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedOr16_rel(scope shared(short)* value, short mask) 
@safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_2", "|", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedOr16_nf(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_2", "|", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_acq(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_rel(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_nf(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|", MemoryOrder.raw)(value, mask); + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedOr64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|")(value, mask); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + int _InterlockedOr_np(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_4", "|", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedOr8_np(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_1", "|", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedOr16_np(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_2", "|", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_np(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_or", "or_8", "|", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_HLEAcquire(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "|", "or")(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedOr64_HLERelease(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "|", "or")(value, mask); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedOr_HLEAcquire(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "|", "or")(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedOr_HLERelease(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "|", "or")(value, mask); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + alias t(alias symbol, T) = interlockedOpTest!("|", symbol, T); + + t!(_InterlockedOr, int)(); + t!(_InterlockedOr8, byte)(); + t!(_InterlockedOr16, short)(); + t!(_interlockedor64, long)(); + + version (AArch64_Or_ARM) + { + t!(_InterlockedOr_acq, int)(); + t!(_InterlockedOr_rel, int)(); + t!(_InterlockedOr_nf, int)(); + t!(_InterlockedOr8_acq, byte)(); + t!(_InterlockedOr8_rel, byte)(); + t!(_InterlockedOr8_nf, byte)(); + t!(_InterlockedOr16_acq, short)(); + t!(_InterlockedOr16_rel, short)(); + t!(_InterlockedOr16_nf, short)(); + t!(_InterlockedOr64_acq, long)(); + t!(_InterlockedOr64_rel, long)(); + t!(_InterlockedOr64_nf, long)(); + } + + version (X86_64_Or_AArch64_Or_ARM) + { + t!(_InterlockedOr64, long)(); + } + + version (X86_64) + { + t!(_InterlockedOr_np, int)(); + t!(_InterlockedOr8_np, byte)(); + t!(_InterlockedOr16_np, short)(); + t!(_InterlockedOr64_np, long)(); + t!(_InterlockedOr64_HLEAcquire, long)(); + t!(_InterlockedOr64_HLERelease, long)(); + } + + version (X86_64_Or_X86) + { + t!(_InterlockedOr_HLEAcquire, int)(); + t!(_InterlockedOr_HLERelease, int)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int _InterlockedXor(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_4", "^")(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedXor8(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_1", "^")(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedXor16(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_2", "^")(value, mask); + } + + extern(C) + pragma(inline, true) + long _interlockedxor64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^")(value, mask); + } + + version (AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + int _InterlockedXor_acq(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_4", "^", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedXor_rel(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_4", "^", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedXor_nf(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_4", "^", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedXor8_acq(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_1", "^", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedXor8_rel(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_1", "^", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedXor8_nf(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_1", "^", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedXor16_acq(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_2", "^", MemoryOrder.acq)(value, mask); 
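+
+            /* The ARM suffix convention used throughout these families:
+               `_acq` maps to MemoryOrder.acq, `_rel` to MemoryOrder.acq_rel,
+               and `_nf` ("no fence") to MemoryOrder.raw. */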
+ } + + extern(C) + pragma(inline, true) + short _InterlockedXor16_rel(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_2", "^", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedXor16_nf(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_2", "^", MemoryOrder.raw)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_acq(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^", MemoryOrder.acq)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_rel(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^", MemoryOrder.acq_rel)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_nf(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^", MemoryOrder.raw)(value, mask); + } + } + + version (X86_64_Or_AArch64_Or_ARM) + { + extern(C) + pragma(inline, true) + long _InterlockedXor64(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^")(value, mask); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + int _InterlockedXor_np(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_4", "^", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + byte _InterlockedXor8_np(scope shared(byte)* value, byte mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_1", "^", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + short _InterlockedXor16_np(scope shared(short)* value, short mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_2", "^", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_np(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOp!("rmw_xor", "xor_8", "^", MemoryOrder.seq, true)(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_HLEAcquire(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "^", "xor")(value, mask); + } + + extern(C) + pragma(inline, true) + long _InterlockedXor64_HLERelease(scope shared(long)* value, long mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "^", "xor")(value, mask); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + int _InterlockedXor_HLEAcquire(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(true, "^", "xor")(value, mask); + } + + extern(C) + pragma(inline, true) + int _InterlockedXor_HLERelease(scope shared(int)* value, int mask) @safe pure nothrow @nogc + { + return interlockedOpHLE!(false, "^", "xor")(value, mask); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
*/ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + alias t(alias symbol, T) = interlockedOpTest!("^", symbol, T); + + t!(_InterlockedXor, int)(); + t!(_InterlockedXor8, byte)(); + t!(_InterlockedXor16, short)(); + t!(_interlockedxor64, long)(); + + version (AArch64_Or_ARM) + { + t!(_InterlockedXor_acq, int)(); + t!(_InterlockedXor_rel, int)(); + t!(_InterlockedXor_nf, int)(); + t!(_InterlockedXor8_acq, byte)(); + t!(_InterlockedXor8_rel, byte)(); + t!(_InterlockedXor8_nf, byte)(); + t!(_InterlockedXor16_acq, short)(); + t!(_InterlockedXor16_rel, short)(); + t!(_InterlockedXor16_nf, short)(); + t!(_InterlockedXor64_acq, long)(); + t!(_InterlockedXor64_rel, long)(); + t!(_InterlockedXor64_nf, long)(); + } + + version (X86_64_Or_AArch64_Or_ARM) + { + t!(_InterlockedXor64, long)(); + } + + version (X86_64) + { + t!(_InterlockedXor_np, int)(); + t!(_InterlockedXor8_np, byte)(); + t!(_InterlockedXor16_np, short)(); + t!(_InterlockedXor64_np, long)(); + t!(_InterlockedXor64_HLEAcquire, long)(); + t!(_InterlockedXor64_HLERelease, long)(); + } + + version (X86_64_Or_X86) + { + t!(_InterlockedXor_HLEAcquire, int)(); + t!(_InterlockedXor_HLERelease, int)(); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + private T interlockedAdd(MemoryOrder order = MemoryOrder.seq, T)(scope shared(T)* address, T value) + @safe pure nothrow @nogc + { + if (__ctfe) + { + return *((a) @trusted => cast(T*) a)(address) += value; + } + else + { + import core.internal.atomic : atomicFetchAdd; + + T result = cast(T) (atomicFetchAdd!order(address, value) + value); + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. */ + static if (order == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return result; + } + } + + extern(C) + pragma(inline, true) + private T interlockedExchangeAdd(MemoryOrder order = MemoryOrder.seq, T)(scope shared(T)* address, T value) + @safe pure nothrow @nogc + { + if (__ctfe) + { + scope a = ((a) @trusted => cast(T*) a)(address); + T oldValue = *a; + *a += value; + return oldValue; + } + else + { + import core.internal.atomic : atomicFetchAdd; + + T result = atomicFetchAdd!order(address, value); + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. 
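+                   On ARM, they follow acquire-ordered operations with a full
+                   inner-shareable barrier, hence the `dmb ish` below.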
*/ + static if (order == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return result; + } + } + + extern(C) + pragma(inline, true) + private ubyte interlockedBitTestOp( + string x86OpCode, + string ldcName, + string gdcName, + string op, + string unaryOp = "", + MemoryOrder order = MemoryOrder.seq, + uint x86HLE = 0, + T + )(scope shared(T)* address, T bitIndex) @system pure nothrow @nogc + { + static ubyte bitTestOpViaSoftware(scope shared(T)* address, T bitIndex) + { + import core.bitop : bsr, popcnt; + + enum uint bitCount = T.sizeof << 3; + enum uint bitShift = bitCount.bsr; + enum T bitMask = bitCount - 1; + + scope shared(T)* integer = address + (bitIndex >> bitShift); + const T mask = T(1) << (bitIndex & bitMask); + + return (interlockedOp!(ldcName, gdcName, op, order)(integer, mixin(unaryOp, q{mask})) & mask) != 0; + } + + if (__ctfe) + { + return bitTestOpViaSoftware(address, bitIndex); + } + else + { + version (X86_64_Or_X86) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + enum type = ["i8", "i16", "i32", "i64"][size]; + enum x86Ptr = ["byte", "word", "dword", "qword"][size]; + enum imm = ["", "", "I", "J"][size]; + enum ptr = llvmIRPtr!type ~ " elementtype(" ~ type ~ ")"; + enum hlePrefix = x86HLE == 0 ? "" : (x86HLE == 1 ? "xacquire " : "xrelease "); + + return __ir_pure!( + `%bitIsSet = call i8 asm sideeffect inteldialect + "` ~ hlePrefix ~ `lock ` ~ x86OpCode ~ ` ` ~ x86Ptr ~ ` ptr $1, $2", + "={@ccc},=*m,` ~ imm ~ `r,~{memory},~{flags}" + (` ~ ptr ~ ` %0, ` ~ type ~ ` %1) + ret i8 %bitIsSet;`, + ubyte + )(address, bitIndex); + } + else version (GNU) + { + enum char suffix = "bwlq"[size]; + enum imm = ["Wb", "Ww", "I", "J"][size]; + enum hlePrefix = x86HLE == 0 ? "" : (x86HLE == 1 ? "xacquire " : "xrelease "); + + ubyte bitIsSet; + + mixin( + `asm @system pure nothrow @nogc + { + "" ~ hlePrefix ~ "lock " ~ x86OpCode ~ suffix ~ " %2, %0" + : "+m" (*address), "=@ccc" (bitIsSet) + : "` ~ imm ~ `r" (bitIndex) + : "memory", "cc"; + }` + ); + + return bitIsSet; + } + else version (InlineAsm_X86_64_Or_X86) + { + enum d = ["DL", "DX", "EDX", "RDX"][size]; + enum ptr = ["byte", "word", "dword", "qword"][size]; + enum xacquire = "repne; "; + enum xrelease = "rep; "; + enum hlePrefix = x86HLE == 0 ? "" : (x86HLE == 1 ? xacquire : xrelease); + + version (D_InlineAsm_X86_64) + { + mixin( + "asm pure nothrow @nogc + { + /* RCX is address; RDX is bitIndex. */ + naked; + " ~ hlePrefix ~ "lock; " ~ x86OpCode ~ " " ~ ptr ~ " ptr [RCX], " ~ d ~ "; + setc AL; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* address. */ + mov EDX, [ESP + 8]; /* bitIndex. */ + " ~ hlePrefix ~ "lock; " ~ x86OpCode ~ " " ~ ptr ~ " ptr [ECX], " ~ d ~ "; + setc AL; + ret; + }" + ); + } + } + } + else + { + return bitTestOpViaSoftware(address, bitIndex); + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + private T interlockedExchangeAddHLE(bool acquire, T)(scope shared(T)* address, scope T value) + { + if (__ctfe) + { + return interlockedExchangeAdd!(MemoryOrder.seq, T)(address, value); + } + else + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + enum ptr = llvmIRPtr!type; + + return __ir_pure!( + `%oldValue = call ` ~ type ~ ` asm sideeffect inteldialect + "` ~ (acquire ? 
"xacquire" : "xrelease") ~ ` lock xadd $1, $0", + "=r,=*m,0,~{memory},~{flags}" + ( ` ~ ptr ~ ` elementtype(` ~ type ~ `)` ~ ` %0, ` ~ type ~ ` %1) + + ret ` ~ type ~ ` %oldValue`, + T + )(address, value); + } + else version (GNU) + { + static if (acquire) + { + /* This is equivalent to GCC's __ATOMIC_HLE_ACQUIRE. */ + enum int hleModifier = 1 << 16; + } + else + { + /* This is equivalent to GCC's __ATOMIC_HLE_RELEASE. */ + enum int hleModifier = 1 << 17; + } + + enum int hleOrder = MemoryOrder.seq | hleModifier; + enum add = "__atomic_fetch_add_" ~ ('0' + T.sizeof); + + mixin(q{import gcc.builtins : }, add, q{;}); + + return mixin(add)(address, value, hleOrder); + } + else version (InlineAsm_X86_64_Or_X86) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum xacquire = "repne"; + enum xrelease = "rep"; + + version (D_InlineAsm_X86_64) + { + enum fullA = ["EAX", "EAX", "EAX", "RAX"][size]; + enum fullD = ["EDX", "EDX", "EDX", "RDX"][size]; + enum a = ["AL", "AX", "EAX", "RAX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is address; RDX is value. */ + naked; + mov " ~ fullA ~ ", " ~ fullD ~ "; + " ~ (acquire ? xacquire : xrelease) ~ "; lock; xadd [RCX], " ~ a ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + enum a = ["AL", "AX", "EAX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* address. */ + mov EAX, [ESP + 8]; /* value. */ + " ~ (acquire ? xacquire : xrelease) ~ "; lock; xadd [ECX], " ~ a ~ "; + ret; + }" + ); + } + } + } + } + + extern(C) + pragma(inline, true) + private T interlockedOpHLE(bool acquire, string op, string x86OpCode, T)(scope shared(T)* address, T operand) + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted + { + if (__ctfe) + { + scope a = ((a) @trusted => cast(T*) a)(address); + T oldValue = *a; + mixin(q{*a }, op, q{= operand;}); + return oldValue; + } + else + { + version (X86_64) + { + version (LDC) + { + import core.simd : prefetch; + prefetch!(true, 3)(((a) @trusted => cast(const(void)*) a)(address)); + } + else version (GNU) + { + import gcc.builtins : __builtin_prefetch; + __builtin_prefetch(((a) @trusted => cast(const(void)*) a)(address), 1, 3); + } + } + + version (LDC) + { + import core.bitop : bsr; + import core.internal.atomic : atomicLoad; + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + enum a = ["al", "ax", "eax", "rax"][size]; + + enum ptr = llvmIRPtr!type; + T value = atomicLoad!(MemoryOrder.raw)(address); + + while ( + !__ir_pure!( + `%value = load ` ~ type ~ `, ` ~ ptr ~ ` %1 + + %cas = call {` ~ type ~ `, i8} asm sideeffect inteldialect + "` ~ (acquire ? "xacquire" : "xrelease") ~ ` lock cmpxchg $1, $4", + "={` ~ a ~ `},=*m,={@ccz},0,r,~{memory},~{flags}" + (` ~ ptr ~ ` elementtype(` ~ type ~ `)` ~ ` %0, ` ~ type ~ ` %value, ` ~ type ~ ` %2) + + %oldValue = extractvalue {` ~ type ~ `, i8} %cas, 0 + %stored = extractvalue {` ~ type ~ `, i8} %cas, 1 + + store ` ~ type ~ ` %oldValue, ` ~ ptr ~ ` %1 + ret i8 %stored`, + ubyte + )(address, &value, cast(T) (mixin(q{value }, op, q{ operand}))) + ) + { + static if (acquire) + { + __builtin_ia32_pause(); + } + } + + return value; + } + else version (GNU) + { + static if (acquire) + { + /* This is equivalent to GCC's __ATOMIC_HLE_ACQUIRE. */ + enum int hleModifier = 1 << 16; + } + else + { + /* This is equivalent to GCC's __ATOMIC_HLE_RELEASE. 
*/ + enum int hleModifier = 1 << 17; + } + + enum int hleOrder = MemoryOrder.seq | hleModifier; + enum cas = "__atomic_compare_exchange_" ~ ('0' + T.sizeof); + enum load = "__atomic_load_" ~ ('0' + T.sizeof); + + mixin(q{import gcc.builtins : }, cas, q{, }, load, q{, __builtin_ia32_pause;}); + + T value = mixin(load)(address, MemoryOrder.raw); + + while ( + !mixin(cas)( + address, + &value, + cast(T) (mixin(q{value }, op, q{ operand})), + true, + hleOrder, + hleOrder + ) + ) + { + static if (acquire) + { + __builtin_ia32_pause(); + } + } + + return value; + } + else version (D_InlineAsm_X86_64) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum fullA = ["EAX", "EAX", "EAX", "RAX"][size]; + enum fullR8 = ["R8D", "R8D", "R8D", "R8"][size]; + enum fullMOV = ["movzx", "movzx", "mov", "mov"][size]; + enum fastD = ["DL", "EDX", "EDX", "RDX"][size]; + enum fastR8 = ["R8B", "R8D", "R8D", "R8"][size]; + enum r8 = ["R8B", "R8W", "R8D", "R8"][size]; + enum ptr = ["byte", "word", "dword", "qword"][size]; + enum xacquire = "repne"; + enum xrelease = "rep"; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is address; RDX is operand. */ + naked; + prefetchw byte ptr [RCX]; + " ~ fullMOV ~ " " ~ fullA ~ ", " ~ ptr ~ " ptr [RCX]; + cas: + mov " ~ fullR8 ~ ", " ~ fullA ~ "; + " ~ x86OpCode ~ " " ~ fastR8 ~ ", " ~ fastD ~ "; + " ~ (acquire ? xacquire : xrelease) ~ "; lock; cmpxchg [RCX], " ~ r8 ~ "; + " ~ ( + acquire + ? "je swapped; + rep; nop; /* pause */ + jmp cas;" + : "jne cas;" + ) ~ " + swapped: + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum fullA = ["EAX", "EAX", "EAX"][size]; + enum fullB = ["EBX", "EBX", "EBX"][size]; + enum fullMOV = ["movzx", "movzx", "mov"][size]; + enum fastB = ["BL", "EBX", "EBX"][size]; + enum fastD = ["DL", "EDX", "EDX"][size]; + enum b = ["BL", "BX", "EBX"][size]; + enum ptr = ["byte", "word", "dword"][size]; + enum xacquire = "repne"; + enum xrelease = "rep"; + + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + push EBX; + mov ECX, [ESP + 8]; /* address. */ + " ~ fullMOV ~ " " ~ fullA ~ ", " ~ ptr ~ " ptr [ECX]; + mov EDX, [ESP + 12]; /* operand. */ + cas: + mov " ~ fullB ~ ", " ~ fullA ~ "; + " ~ x86OpCode ~ " " ~ fastB ~ ", " ~ fastD ~ "; + " ~ (acquire ? xacquire : xrelease) ~ "; lock; cmpxchg [ECX], " ~ b ~ "; + " ~ ( + acquire + ? "je swapped; + rep; nop; /* pause */ + jmp cas;" + : "jne cas;" + ) ~ " + swapped: + pop EBX; + ret; + }" + ); + } + } + } + + extern(C) + pragma(inline, true) + private T interlockedCASHLE(bool acquire, T)( + scope shared(T)* address, + scope T valueToSet, + return scope T expectedValue + ) @trusted + { + if (__ctfe) + { + return interlockedCAS!(MemoryOrder.seq, MemoryOrder.seq, T)(address, valueToSet, expectedValue); + } + else + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + + static if (is(T == P*, P)) + { + enum type = llvmIRPtr!"i8"; + } + else + { + enum type = ["i8", "i16", "i32", "i64"][size]; + } + + version (X86) + { + enum bool canUseCMPXCHG = T.sizeof <= 4; + } + else version (X86_64) + { + enum bool canUseCMPXCHG = true; + } + + enum ptr = llvmIRPtr!type; + + static if (canUseCMPXCHG) + { + enum a = ["al", "ax", "eax", "rax"][size]; + + return __ir_pure!( + `%oldValue = call ` ~ type ~ ` asm sideeffect inteldialect + "` ~ (acquire ? 
"xacquire" : "xrelease") ~ ` lock cmpxchg $1, $3", + "={` ~ a ~ `},=*m,0,r,~{memory},~{flags}" + (` ~ ptr ~ ` elementtype(` ~ type ~ `)` ~ ` %0, ` ~ type ~ ` %2, ` ~ type ~ ` %1) + + ret ` ~ type ~ ` %oldValue`, + T + )(address, valueToSet, expectedValue); + } + else + { + uint lo; + uint hi; + + return __ir_pure!( + `%oldValue = call {i32, i32} asm sideeffect inteldialect + "` ~ (acquire ? "xacquire" : "xrelease") ~ ` lock cmpxchg8b $2", + "={eax},={edx},=*m,0,1,{ebx},{ecx},~{memory},~{flags}" + (` ~ ptr ~ ` elementtype(i64)` ~ ` %0, i32 %3, i32 %4, i32 %1, i32 %2) + + %lo32 = extractvalue {i32, i32} %oldValue, 0 + %hi32 = extractvalue {i32, i32} %oldValue, 1 + + %lo = zext i32 %lo32 to i64 + %hi = zext i32 %hi32 to i64 + %hi64 = shl i64 %hi, 32 + %result = or i64 %hi64, %lo + + ret i64 %result`, + T + )( + address, + cast(uint) valueToSet, + cast(uint) (valueToSet >>> 32), + cast(uint) expectedValue, + cast(uint) (expectedValue >>> 32) + ); + } + } + else version (GNU) + { + static if (acquire) + { + /* This is equivalent to GCC's __ATOMIC_HLE_ACQUIRE. */ + enum int hleModifier = 1 << 16; + } + else + { + /* This is equivalent to GCC's __ATOMIC_HLE_RELEASE. */ + enum int hleModifier = 1 << 17; + } + + enum int hleOrder = MemoryOrder.seq | hleModifier; + enum cas = "__atomic_compare_exchange_" ~ ('0' + T.sizeof); + + import core.internal.traits : AliasSeq; + import core.bitop : bsr; + mixin(q{import gcc.builtins : }, cas, q{;}); + + alias Int = AliasSeq!(ubyte, ushort, uint, ulong)[T.sizeof.bsr]; + + cast(void) mixin(cas)( + address, + cast(Int*) &expectedValue, + cast(Int) valueToSet, + false, + hleOrder, + hleOrder + ); + + return expectedValue; + } + else version (D_InlineAsm_X86_64) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum fullA = ["EAX", "EAX", "EAX", "RAX"][size]; + enum fullR8 = ["R8D", "R8D", "R8D", "R8"][size]; + enum d = ["DL", "DX", "EDX", "RDX"][size]; + enum xacquire = "repne"; + enum xrelease = "rep"; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is address; RDX is valueToSet; R8 is expectedValue. */ + naked; + mov " ~ fullA ~ ", " ~ fullR8 ~ "; + " ~ (acquire ? xacquire : xrelease) ~ "; lock; cmpxchg [RCX], " ~ d ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + enum xacquire = "repne"; + enum xrelease = "rep"; + + static if (T.sizeof <= 4) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum d = ["DL", "DX", "EDX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* address. */ + mov EDX, [ESP + 8]; /* valueToSet. */ + mov EAX, [ESP + 12]; /* expectedValue. */ + " ~ (acquire ? xacquire : xrelease) ~ "; lock; cmpxchg [ECX], " ~ d ~ "; + ret; + }" + ); + } + else static if (T.sizeof <= 8) + { + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + push ESI; + push EBX; + mov ESI, [ESP + 12]; /* address. */ + mov ECX, [ESP + 20]; /* High half of valueToSet. */ + mov EBX, [ESP + 16]; /* Low half of valueToSet. */ + mov EDX, [ESP + 28]; /* High half of expectedValue. */ + mov EAX, [ESP + 24]; /* Low half of expectedValue. */ + " ~ (acquire ? 
xacquire : xrelease) ~ "; lock; cmpxchg8b [ESI]; + pop EBX; + pop ESI; + ret; + }" + ); + } + } + } + } + + extern(C) + pragma(inline, true) + private T interlockedExchangeHLE(bool acquire, T)(scope shared(T)* address, scope T value) + @trusted + { + if (__ctfe) + { + T oldValue = *cast(T*) address; + *cast(T*) address = value; + return oldValue; + } + else + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + + static if (is(T == P*, P)) + { + enum type = llvmIRPtr!"i8"; + } + else + { + enum type = ["i8", "i16", "i32", "i64"][size]; + } + + enum ptr = llvmIRPtr!type; + + return __ir_pure!( + `%oldValue = call ` ~ type ~ ` asm sideeffect inteldialect + "` ~ (acquire ? "xacquire" : "xrelease") ~ ` xchg $1, $0", + "=r,=*m,0,~{memory}" + ( ` ~ ptr ~ ` elementtype(` ~ type ~ `)` ~ ` %0, ` ~ type ~ ` %1) + + ret ` ~ type ~ ` %oldValue`, + T + )(address, value); + } + else version (GNU) + { + static if (acquire) + { + /* This is equivalent to GCC's __ATOMIC_HLE_ACQUIRE. */ + enum int hleModifier = 1 << 16; + } + else + { + /* This is equivalent to GCC's __ATOMIC_HLE_RELEASE. */ + enum int hleModifier = 1 << 17; + } + + enum int hleOrder = MemoryOrder.seq | hleModifier; + enum exchange = "__atomic_exchange_" ~ ('0' + T.sizeof); + + import core.internal.traits : AliasSeq; + import core.bitop : bsr; + mixin(q{import gcc.builtins : }, exchange, q{;}); + + alias Int = AliasSeq!(ubyte, ushort, uint, ulong)[T.sizeof.bsr]; + + return cast(T) mixin(exchange)(address, cast(Int) value, hleOrder); + } + else version (InlineAsm_X86_64_Or_X86) + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + enum xacquire = "repne"; + enum xrelease = "rep"; + + version (D_InlineAsm_X86_64) + { + enum fullA = ["EAX", "EAX", "EAX", "RAX"][size]; + enum fullD = ["EDX", "EDX", "EDX", "RDX"][size]; + enum a = ["AL", "AX", "EAX", "RAX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* RCX is address; RDX is value. */ + naked; + mov " ~ fullA ~ ", " ~ fullD ~ "; + " ~ (acquire ? xacquire : xrelease) ~ "; xchg [RCX], " ~ a ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + enum a = ["AL", "AX", "EAX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* address. */ + mov EAX, [ESP + 8]; /* value. */ + " ~ (acquire ? 
xacquire : xrelease) ~ "; xchg [ECX], " ~ a ~ "; + ret; + }" + ); + } + } + } + } + } + + extern(C) + pragma(inline, true) + private T interlockedExchange(MemoryOrder order = MemoryOrder.seq, T)(scope shared(T)* address, scope T value) + @trusted + { + if (__ctfe) + { + T oldValue = *cast(T*) address; + *cast(T*) address = value; + return oldValue; + } + else + { + static if (order == MemoryOrder.acq) + { + /* atomicExchange rejects acq memory-ordering as invalid, but this is what MSVC does, so: ¯\_(ツ)_/¯ */ + + version (LDC) + { + import core.internal.atomic : _ordering; + import ldc.intrinsics : llvm_atomic_rmw_xchg; + + T result = llvm_atomic_rmw_xchg!(T)(address, value, _ordering!order); + } + else version (GNU) + { + import core.internal.traits : AliasSeq; + import core.bitop : bsr; + enum exchange = "__atomic_exchange_" ~ ('0' + T.sizeof); + mixin(q{import gcc.builtins : }, exchange, q{;}); + + alias Int = AliasSeq!(ubyte, ushort, uint, ulong)[T.sizeof.bsr]; + + T result = cast(T) mixin(exchange)(address, cast(Int) value, order); + } + else + { + static assert(false, "This is instantiated only for ARM/AArch64 targets."); + } + } + else + { + import core.internal.atomic : atomicExchange; + + T result = atomicExchange!(order, true, T)(cast(T*) address, value); + } + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. */ + static if (order == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return result; + } + } + + extern(C) + pragma(inline, true) + private T interlockedCAS(MemoryOrder success = MemoryOrder.seq, MemoryOrder failure = success, T)( + scope shared(T)* address, + scope T valueToSet, + return scope T expectedValue + ) @trusted pure nothrow @nogc + { + if (__ctfe) + { + scope a = ((a) @trusted => cast(T*) a)(address); + T oldValue = *a; + + if (oldValue == expectedValue) + { + *a = valueToSet; + } + + return oldValue; + } + else + { + import core.internal.atomic : atomicCompareExchangeStrong; + + cast(void) atomicCompareExchangeStrong!(success, failure)(cast(T*) address, &expectedValue, valueToSet); + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. */ + static if (success == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return expectedValue; + } + } + + extern(C) + pragma(inline, true) + private ubyte interlockedCAS128(MemoryOrder success = MemoryOrder.seq, MemoryOrder failure = success)( + scope shared(long)* address, + long valueToSetHigh, + long valueToSetLow, + scope long* expectedValue + ) @system pure nothrow @nogc + { + import core.internal.atomic : atomicCompareExchangeStrong; + + version (LittleEndian) + { + enum size_t lo = 0; + enum size_t hi = 1; + } + else version (BigEndian) + { + enum size_t lo = 1; + enum size_t hi = 0; + } + + if (__ctfe) + { + scope a = ((a) @trusted => cast(long*) a)(address); + + if (a[0] == expectedValue[0] && a[1] == expectedValue[1]) + { + a[lo] = valueToSetLow; + a[hi] = valueToSetHigh; + + return 1; + } + + expectedValue[0] = a[0]; + expectedValue[1] = a[1]; + + return 0; + } + else + { + ulong[2] valueToSet = void; + valueToSet[lo] = valueToSetLow; + valueToSet[hi] = valueToSetHigh; + + bool result = atomicCompareExchangeStrong!(success, failure)( + cast(ulong[2]*) address, + cast(ulong[2]*) expectedValue, + valueToSet + ); + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. 
*/ + static if (success == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return result; + } + } + + extern(C) + pragma(inline, true) + private T interlockedOp( + string ldcName, + string gdcName, + string op, + MemoryOrder order = MemoryOrder.seq, + bool noPrefetch = false, + T + )( + scope shared(T)* address, + T operand + ) @trusted pure nothrow @nogc + { + if (__ctfe) + { + scope a = ((a) @trusted => cast(T*) a)(address); + T oldValue = *a; + mixin(q{*a }, op, q{= operand;}); + return oldValue; + } + else + { + version (X86_64) + { + static if (!noPrefetch) + { + version (GNU) + { + import gcc.builtins : __builtin_prefetch; + __builtin_prefetch(((a) @trusted => cast(const(void)*) a)(address), 1, 3); + } + else + { + import core.simd : prefetch; + prefetch!(true, 3)(((a) @trusted => cast(const(void)*) a)(address)); + } + } + } + + version (LDC) + { + enum string name = "llvm_atomic_" ~ ldcName; + + import core.internal.atomic : _ordering; + mixin(q{import ldc.intrinsics : }, name, q{;}); + + T value = mixin(name)(address, operand, _ordering!order); + } + else version (GNU) + { + enum string name = "__atomic_fetch_" ~ gdcName; + + mixin(q{import gcc.builtins : }, name, q{;}); + + T value = mixin(name)(address, operand, order); + } + else + { + import core.internal.atomic : atomicCompareExchangeWeak, atomicLoad; + + T value = atomicLoad!(MemoryOrder.raw)(address); + + while ( + !atomicCompareExchangeWeak!(order, order)( + cast(T*) address, + &value, + mixin(q{value }, op, q{ operand}) + ) + ) + {} + } + + version (AArch64_Or_ARM) + { + /* This is what the Interlocked MSVC intrinsics do. */ + static if (order == MemoryOrder.acq) + { + /* dmb ish */ + __builtin_arm_dmb(11); + } + } + + return value; + } + } + + private void interlockedOpTest(string op, alias symbol, T)() + { + enum ulong fullValue = 0x32515ED8453C5664; + enum ulong fullOperandA = 0x4B71C0BCC5836855; + enum ulong fullOperandB = 0x2E934F81075982C8; + + shared T value = cast(T) fullValue; + shared T oldValue = value; + T operandA = cast(T) fullOperandA; + T operandB = cast(T) fullOperandB; + + assert(symbol(&value, operandA) == oldValue); + assert(value == cast(T) (mixin(q{oldValue }, op, q{ operandA}))); + oldValue = value; + + assert(symbol(&value, operandB) == oldValue); + assert(value == cast(T) (mixin(q{oldValue }, op, q{ operandB}))); + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ubyte __inbyte(ushort Port) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + return x86In!ubyte(Port); + } + else + { + import core.bitop : inp; + return inp(Port); + } + } + + extern(C) + pragma(inline, true) + ushort __inword(ushort Port) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + return x86In!ushort(Port); + } + else + { + import core.bitop : inpw; + return inpw(Port); + } + } + + extern(C) + pragma(inline, true) + uint __indword(ushort Port) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + return x86In!uint(Port); + } + else + { + import core.bitop : inpl; + return inpl(Port); + } + } + + extern(C) + pragma(inline, true) + void __outbyte(ushort Port, ubyte Data) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + x86Out(Port, Data); + } + else + { + import core.bitop : outp; + outp(Port, Data); + } + } + + extern(C) + pragma(inline, true) + void __outword(ushort Port, ushort Data) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + x86Out(Port, Data); + } + else + { + import core.bitop : outpw; + outpw(Port, Data); + } + } + + extern(C) + pragma(inline, true) 
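+    /* Writes a doubleword (32 bits) to the I/O port given by Port, mirroring
+       __outbyte and __outword above: it compiles down to the x86 `out`
+       instruction (x86Out on LDC and GDC, core.bitop.outpl otherwise).
+       Port I/O needs ring-0 privilege; as a hypothetical example, kernel code
+       could select a PCI configuration address with
+       `__outdword(0xCF8, 0x80000000);`. */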
+ void __outdword(ushort Port, uint Data) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + x86Out(Port, Data); + } + else + { + import core.bitop : outpl; + outpl(Port, Data); + } + } + + version (LDC_Or_GNU) + { + extern(C) + pragma(inline, true) + private T x86In(T)(ushort port) @system nothrow @nogc + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32"][size]; + enum a = ["al", "ax", "eax"][size]; + + return __ir!( + `%value = call ` ~ type ~ ` asm sideeffect inteldialect + "in $0, $1", + "={` ~ a ~ `},N{dx},~{memory}" + (i16 %0) + + ret ` ~ type ~ ` %value`, + T + )(port); + } + else version (GNU) + { + T result; + + asm @system nothrow @nogc + { + "in %w1, %0" : "=a" (result) : "Nd" (port) : "memory"; + } + + return result; + } + } + + extern(C) + pragma(inline, true) + private void x86Out(T)(ushort port, T data) @system nothrow @nogc + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32"][size]; + enum a = ["al", "ax", "eax"][size]; + + __ir!( + `call void asm sideeffect inteldialect + "out $0, $1", + "N{dx},{` ~ a ~ `},~{memory}" + (i16 %0, ` ~ type ~ ` %1)`, + void + )(port, data); + } + else version (GNU) + { + asm @system nothrow @nogc + { + "out %1, %w0" : : "Nd" (port), "a" (data) : "memory"; + } + } + } + } + + extern(C) + pragma(inline, true) + void __inbytestring(ushort Port, scope ubyte* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'I'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + void __inwordstring(ushort Port, scope ushort* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'I'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + void __indwordstring(ushort Port, scope uint* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'I'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + void __outbytestring(ushort Port, scope ubyte* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'O'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + void __outwordstring(ushort Port, scope ushort* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'O'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + void __outdwordstring(ushort Port, scope uint* Buffer, uint Count) @system nothrow @nogc + { + x86InOutString!'O'(Port, Buffer, Count); + } + + extern(C) + pragma(inline, true) + private void x86InOutString(char io, T)(ushort port, scope T* buffer, uint bufferLength) @system nothrow @nogc + { + import core.bitop : bsr; + + enum size = T.sizeof.bsr; + + version (X86) + { + enum indexPrefix = 'E'; + } + else version (X86_64) + { + enum indexPrefix = 'R'; + } + + static if (io == 'I') + { + enum opCode = "ins"; + enum index = indexPrefix ~ "DI"; + } + else static if (io == 'O') + { + enum opCode = "outs"; + enum index = indexPrefix ~ "SI"; + } + + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + enum char suffix = "bwl"[size]; + enum type = ["i8", "i16", "i32"][size]; + enum ptr = llvmIRPtr!type; + + __ir!( + `call {` ~ ptr ~ `, i32} asm + "rep ` ~ opCode ~ suffix ~ `", + "=&{` ~ index ~ `},=&{ecx},{dx},0,1,~{memory}" + (i16 %0, ` ~ ptr ~ ` %1, i32 %2)`, + void + )(port, buffer, bufferLength); + } + else version (GNU) + { + enum char suffix = "bwl"[size]; + + mixin( + `asm @system nothrow @nogc + { + "rep " ~ opCode ~ suffix + : "=` ~ index[1] ~ `" 
(buffer), "=c" (bufferLength) + : "0" (buffer), "1" (bufferLength), "d" (port) + : "memory"; + }` + ); + } + else version (InlineAsm_X86_64_Or_X86) + { + enum char suffix = "bwd"[size]; + + version (D_InlineAsm_X86_64) + { + mixin( + "asm @trusted pure nothrow @nogc + { + /* CX is port; RDX is buffer; R8D is bufferLength. */ + naked; + mov R9, " ~ index ~ "; /* R[DS]I is non-volatile, so we save it in R9. */ + mov " ~ index ~ ", RDX; + mov EDX, ECX; + mov ECX, R8D; + rep; " ~ opCode ~ suffix ~ "; + mov " ~ index ~ ", R9; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + mov EAX, " ~ index ~ "; /* E[DS]I is non-volatile, so we save it in EAX. */ + mov ECX, [ESP + 12]; /* bufferLength. */ + mov " ~ index ~ ", [ESP + 8]; /* buffer. */ + mov EDX, [ESP + 4]; /* port. */ + rep; " ~ opCode ~ suffix ~ "; + mov " ~ index ~ ", EAX; + ret; + }" + ); + } + } + } + + extern(C) + pragma(inline, true) + void __int2c() @safe pure nothrow @nogc + { + /+ Theoretically, this could clobber memory and registers, but in practice, on Windows, this just + causes an assertion failure for debuggers. So, only the flags are clobbered. +/ + + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + __ir_pure!(`call void asm sideeffect inteldialect "int 0x2c", "~{flags}"()`, void)(); + } + else version (GNU) + { + asm @trusted pure nothrow @nogc + { + "int $0x2c" : : : "cc"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + { + int 0x2c; + } + } + } + + extern(C) + pragma(inline, true) + void __invlpg(scope void* Address) @system nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i8" ~ " elementtype(i8)"; + + __ir!( + `call void asm sideeffect inteldialect "invlpg $0", "*m,~{memory}"(` ~ ptr ~ ` %0)`, + void + )(Address); + } + else version (GNU) + { + asm @system nothrow @nogc + { + "invlpg %0" : : "m" (*cast(const(ubyte)*) Address) : "memory"; + } + } + else version (D_InlineAsm_X86_64) + { + asm @system pure nothrow @nogc + { + /* RCX is Address. */ + naked; + invlpg [RCX]; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @system pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* Address. */ + invlpg [ECX]; + ret; + } + } + } + + extern(C) + pragma(inline, true) + void __lidt(scope void* Source) @system nothrow @nogc + { + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + enum type = ["i8", "i16", "i32", "i64"][size_t.sizeof.bsr]; + enum ptr = llvmIRPtr!type ~ " elementtype(" ~ type ~ ")"; + + __ir!( + `call void asm sideeffect inteldialect "lidt $0", "*m,~{memory}"(` ~ ptr ~ ` %0)`, + void + )(Source); + } + else version (GNU) + { + asm @system nothrow @nogc + { + "lidt %0" : : "m" (*cast(const(size_t)*) Source) : "memory"; + } + } + else version (D_InlineAsm_X86_64) + { + asm @system pure nothrow @nogc + { + /* RCX is Source. */ + naked; + lidt [RCX]; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @system pure nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* Source. 
*/ + lidt [ECX]; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ulong __ll_lshift(ulong Mask, int nBit) @safe pure nothrow @nogc + { + version (X86_64) + { + return Mask << (nBit & 63); + } + else version (X86) + { + return Mask << (nBit & 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + version (X86_64) + { + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 3) + == 0b0000000000000000000000000000000000000000000000000000000000100000 + ); + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 34) + == 0b0000000000000000000000000001000000000000000000000000000000000000 + ); + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 68) + == 0b0000000000000000000000000000000000000000000000000000000001000000 + ); + } + else version (X86) + { + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 3) + == 0b0000000000000000000000000000000000000000000000000000000000100000 + ); + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 34) + == 0b1000000000000000000000000000000000000000000000000000000000010000 + ); + assert( + __ll_lshift(0b0010000000000000000000000000000000000000000000000000000000000100, 68) + == 0b0000000000000000000000000000000000000000000000000000000001000000 + ); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + long __ll_rshift(long Mask, int nBit) @safe pure nothrow @nogc + { + version (X86_64) + { + return Mask >> (nBit & 63); + } + else version (X86) + { + return Mask >> (nBit & 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + version (X86_64) + { + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 3) + == 0b1111110000000000000000000000000000000000000000000000000000000000 + ); + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 34) + == 0b1111111111111111111111111111111111111000000000000000000000000000 + ); + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 68) + == 0b1111111000000000000000000000000000000000000000000000000000000000 + ); + } + else version (X86) + { + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 3) + == 0b1111110000000000000000000000000000000000000000000000000000000000 + ); + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 34) + == 0b1111100000000000000000000000000000000000000000000000000000000001 + ); + assert( + __ll_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 68) + == 0b1111111000000000000000000000000000000000000000000000000000000000 + ); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + ulong __ull_rshift(ulong Mask, int nBit) @safe pure nothrow @nogc + { + version (X86_64) + { + return Mask >> (nBit & 63); + } + else version (X86) + { + return Mask >> (nBit & 31); + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + version (X86_64) + { + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 3) + == 0b0001110000000000000000000000000000000000000000000000000000000000 + ); + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 34) + == 
0b0000000000000000000000000000000000111000000000000000000000000000 + ); + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 68) + == 0b0000111000000000000000000000000000000000000000000000000000000000 + ); + } + else version (X86) + { + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 3) + == 0b0001110000000000000000000000000000000000000000000000000000000000 + ); + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 34) + == 0b0011100000000000000000000000000000000000000000000000000000000001 + ); + assert( + __ull_rshift(0b1110000000000000000000000000000000000000000000000000000000000100, 68) + == 0b0000111000000000000000000000000000000000000000000000000000000000 + ); + } + + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ushort __lzcnt16(ushort value) @safe pure nothrow @nogc + { + return leadingZeroCount(value); + } + + extern(C) + pragma(inline, true) + uint __lzcnt(uint value) @safe pure nothrow @nogc + { + return leadingZeroCount(value); + } + + extern(C) + pragma(inline, true) + uint _lzcnt_u32(uint value) @safe pure nothrow @nogc + { + return leadingZeroCount(value); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ulong __lzcnt64(ulong value) @safe pure nothrow @nogc + { + return leadingZeroCount(value); + } + + extern(C) + pragma(inline, true) + ulong _lzcnt_u64(ulong value) @safe pure nothrow @nogc + { + return leadingZeroCount(value); + } + } + + version (X86_64_Or_X86) + { + @safe pure nothrow @nogc unittest + { + import core.bitop : bsr; + import core.cpuid : hasLzcnt; + + static bool testLzcnt() + { + version (X86_64_Or_X86) + { + assert(__lzcnt16(0) == 16); + assert(__lzcnt16(1) == 15); + assert(__lzcnt16(ushort.max) == 0); + + assert(__lzcnt(0) == 32); + assert(_lzcnt_u32(0) == 32); + assert(__lzcnt(1) == 31); + assert(_lzcnt_u32(1) == 31); + assert(__lzcnt(uint.max) == 0); + assert(_lzcnt_u32(uint.max) == 0); + } + + version (X86_64) + { + assert(__lzcnt64(0) == 64); + assert(_lzcnt_u64(0) == 64); + assert(__lzcnt64(1) == 63); + assert(_lzcnt_u64(1) == 63); + assert(__lzcnt64(ulong.max) == 0); + assert(_lzcnt_u64(ulong.max) == 0); + } + + return true; + } + + static bool testBsr() + { + version (X86_64_Or_X86) + { + assert(__lzcnt16(1) == 0); + assert(__lzcnt16(ushort.max) == 15); + + assert(__lzcnt(1) == 0); + assert(_lzcnt_u32(1) == 0); + assert(__lzcnt(uint.max) == 31); + assert(_lzcnt_u32(uint.max) == 31); + } + + version (X86_64) + { + assert(__lzcnt64(1) == 0); + assert(_lzcnt_u64(1) == 0); + assert(__lzcnt64(ulong.max) == 63); + assert(_lzcnt_u64(ulong.max) == 63); + } + + return true; + } + + if (hasLzcnt) + { + assert(testLzcnt()); + } + else + { + assert(testBsr()); + } + + static assert(testLzcnt()); + } + + extern(C) + pragma(inline, true) + private T leadingZeroCount(T)(T value) @safe pure nothrow @nogc + { + /* We use inline assembly for this, instead of intrinsics or relying on the optimiser, + so that lzcnt is emitted even for targets that don't support it, just like MSVC does. */ + + import core.bitop : bsr; + + if (__ctfe) + { + enum T operandSize = cast(T) (T.sizeof << 3); + enum uint operandSizeLessOne = operandSize - 1; + + return value == 0 ? 
operandSize : cast(T) (operandSizeLessOne ^ bsr(value)); + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + + return __ir_pure!( + `%c = call ` ~ type ~ ` asm inteldialect "lzcnt $0, $1", "=r,r,~{flags}"(` ~ type ~ ` %0) + ret ` ~ type ~ ` %c`, + T + )(value); + } + else version (GNU) + { + T result; + + asm @trusted pure nothrow @nogc + { + "lzcnt %1, %0" : "=r" (result) : "rm" (value) : "cc"; + } + + return result; + } + else version (InlineAsm_X86_64_Or_X86) + { + enum size = T.sizeof.bsr; + enum a = ["AL", "AX", "EAX", "RAX"][size]; + + version (D_InlineAsm_X86_64) + { + enum c = ["CL", "CX", "ECX", "RCX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* C is value. */ + naked; + lzcnt " ~ a ~ ", " ~ c ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + lzcnt " ~ a ~ ", [ESP + 4]; /* [ESP + 4] is value. */ + ret; + }" + ); + } + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ushort _tzcnt_u16(ushort value) @safe pure nothrow @nogc + { + return trailingZeroCount(value); + } + + extern(C) + pragma(inline, true) + uint _tzcnt_u32(uint value) @safe pure nothrow @nogc + { + return trailingZeroCount(value); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ulong _tzcnt_u64(ulong value) @safe pure nothrow @nogc + { + return trailingZeroCount(value); + } + } + + version (X86_64_Or_X86) + { + @safe pure nothrow @nogc unittest + { + import core.bitop : bsr; + import core.cpuid : hasLzcnt; + + static bool testTzcnt() + { + version (X86_64_Or_X86) + { + assert(_tzcnt_u16(0) == 16); + assert(_tzcnt_u16(1) == 0); + assert(_tzcnt_u16(1 << 15) == 15); + assert(_tzcnt_u16(ushort.max) == 0); + + assert(_tzcnt_u32(0) == 32); + assert(_tzcnt_u32(1) == 0); + assert(_tzcnt_u32(1 << 31) == 31); + assert(_tzcnt_u32(uint.max) == 0); + } + + version (X86_64) + { + assert(_tzcnt_u64(0) == 64); + assert(_tzcnt_u64(1) == 0); + assert(_tzcnt_u64(ulong(1) << 63) == 63); + assert(_tzcnt_u64(ulong.max) == 0); + } + + return true; + } + + static bool testBsf() + { + version (X86_64_Or_X86) + { + assert(_tzcnt_u16(1) == 0); + assert(_tzcnt_u16(1 << 15) == 15); + assert(_tzcnt_u16(ushort.max) == 0); + + assert(_tzcnt_u32(1) == 0); + assert(_tzcnt_u32(1 << 31) == 31); + assert(_tzcnt_u32(uint.max) == 0); + } + + version (X86_64) + { + assert(_tzcnt_u64(1) == 0); + assert(_tzcnt_u64(ulong(1) << 63) == 63); + assert(_tzcnt_u64(ulong.max) == 0); + } + + return true; + } + + if (hasLzcnt) + { + assert(testTzcnt()); + } + else + { + assert(testBsf()); + } + + static assert(testTzcnt()); + } + + extern(C) + pragma(inline, true) + private T trailingZeroCount(T)(T value) @safe pure nothrow @nogc + { + /* We use inline assembly for this, instead of intrinsics or relying on the optimiser, + so that tzcnt is emitted even for targets that don't support it, just like MSVC does. */ + + import core.bitop : bsf, bsr; + + if (__ctfe) + { + enum T operandSize = cast(T) (T.sizeof << 3); + + return value == 0 ? 
operandSize : cast(T) bsf(value); + } + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + + return __ir_pure!( + `%c = call ` ~ type ~ ` asm inteldialect "tzcnt $0, $1", "=r,r,~{flags}"(` ~ type ~ ` %0) + ret ` ~ type ~ ` %c`, + T + )(value); + } + else version (GNU) + { + T result; + + asm @trusted pure nothrow @nogc + { + "tzcnt %1, %0" : "=r" (result) : "rm" (value) : "cc"; + } + + return result; + } + else version (InlineAsm_X86_64_Or_X86) + { + enum size = T.sizeof.bsr; + enum a = ["AL", "AX", "EAX", "RAX"][size]; + + version (D_InlineAsm_X86_64) + { + enum c = ["CL", "CX", "ECX", "RCX"][size]; + + mixin( + "asm @trusted pure nothrow @nogc + { + /* C is value. */ + naked; + tzcnt " ~ a ~ ", " ~ c ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted pure nothrow @nogc + { + naked; + tzcnt " ~ a ~ ", [ESP + 4]; /* [ESP + 4] is value. */ + ret; + }" + ); + } + } + } + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + float4 _mm_cvtsi64x_ss(float4 a, long b) @safe pure nothrow @nogc + { + if (__ctfe) + { + a.array[0] = ctfeX86RoundLongToFloat(b); + return a; + } + else + { + version (LDC) + { + /* LLVM lacks an intrinsic for the 64-bit version of cvtsi2ss, but this + emits said instruction even when optimisations aren't enabled. */ + a.array[0] = cast(float) b; + return a; + } + else version (GNU) + { + import gcc.builtins : __builtin_ia32_cvtsi642ss; + return __builtin_ia32_cvtsi642ss(a, b); + } + else version (D_InlineAsm_X86_64) + { + /* We could use core.simd.__simd_sto for this, but we don't, because doing so causes + DMD to miscompile calls to this function when optimisations are enabled. */ + + enum ubyte REX_W = 0b0100_1000; + + asm @trusted pure nothrow @nogc + { + /* RCX is a; RDX is b. */ + naked; + movdqa XMM0, [RCX]; + /* DMD refuses to encode `cvtsi2ss XMM0, RDX`, so we'll encode it by hand. */ + db 0xF3, REX_W, 0x0F, 0x2A, 0b11_000_010; /* cvtsi2ss XMM0, RDX */ + ret; + } + } + } + } + + @trusted pure nothrow @nogc unittest + { + static bool test() + { + alias convert = _mm_cvtsi64x_ss; + float4 floats = 2.0f; + + void check(long value, float result) + { + float4 actual = convert(floats, value); + assert(actual.ptr[0] == result); + assert(actual.ptr[1] == 2.0f); + assert(actual.ptr[2] == 2.0f); + assert(actual.ptr[3] == 2.0f); + } + + check(6, 6.0f); + check(long.min, -twoExp63Float); + check(long.max, twoExp63Float); + check(9223371761976868864, twoExp63Float); + check(9223371761976868863, justUnderTwoExp63Float); + check(9223371487098961920, justUnderTwoExp63Float); + check(-9223371761976868864, -twoExp63Float); + check(-9223371761976868863, -justUnderTwoExp63Float); + check(-9223371487098961920, -justUnderTwoExp63Float); + check(33554434, 33554432.0f); + check(-33554434, -33554432.0f); + check(33554438, 33554440.0f); + check(-33554438, -33554440.0f); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + long _mm_cvtss_si64x(float4 value) @safe pure nothrow @nogc + { + if (__ctfe) + { + return ctfeX86RoundFloatToLong(value.array[0]); + } + else + { + version (LDC_Or_GNU) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvtss2si64;}); + + return __builtin_ia32_cvtss2si64(value); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + + asm @trusted pure nothrow @nogc + { + /* RCX is value. 
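+ (Win64 passes 16-byte vector arguments by reference, so RCX holds
+ the address of `value`, hence the memory operand below.)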
*/
+ naked;
+ /* DMD refuses to encode `cvtss2si RAX, [RCX]`, so we'll encode it by hand. */
+ db 0xF3, REX_W, 0x0F, 0x2D, 0b00_000_001; /* cvtss2si RAX, [RCX] */
+ ret;
+ }
+ }
+ }
+ }
+
+ @safe pure nothrow @nogc unittest
+ {
+ static bool test()
+ {
+ assert(_mm_cvtss_si64x([0.0f, 0.0f, 1.0f, 2.0f]) == 0);
+ assert(_mm_cvtss_si64x([1.0f, 0.0f, 1.0f, 2.0f]) == 1);
+ assert(_mm_cvtss_si64x([1.5f, 0.0f, 1.0f, 2.0f]) == 2);
+ assert(_mm_cvtss_si64x([2.5f, 0.0f, 1.0f, 2.0f]) == 2);
+ assert(_mm_cvtss_si64x([3.5f, 0.0f, 1.0f, 2.0f]) == 4);
+ assert(_mm_cvtss_si64x([4.5f, 0.0f, 1.0f, 2.0f]) == 4);
+ assert(_mm_cvtss_si64x([4.51f, 0.0f, 1.0f, 2.0f]) == 5);
+ assert(_mm_cvtss_si64x([5.49f, 0.0f, 1.0f, 2.0f]) == 5);
+ assert(_mm_cvtss_si64x([33554432.0f, 0.0f, 1.0f, 2.0f]) == 33554432);
+ assert(_mm_cvtss_si64x([-33554432.0f, 0.0f, 1.0f, 2.0f]) == -33554432);
+ assert(_mm_cvtss_si64x([justUnderTwoExp63Float, 0.0f, 1.0f, 2.0f]) == 9223371487098961920);
+ assert(_mm_cvtss_si64x([-twoExp63Float, 0.0f, 1.0f, 2.0f]) == long.min);
+ assert(_mm_cvtss_si64x([twoExp63Float, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvtss_si64x([float.nan, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvtss_si64x([-float.nan, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvtss_si64x([float.infinity, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvtss_si64x([-float.infinity, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+
+ return true;
+ }
+
+ assert(test());
+ static assert(test());
+ }
+
+ extern(C)
+ pragma(inline, true)
+ long _mm_cvttss_si64x(float4 value) @safe pure nothrow @nogc
+ {
+ if (__ctfe)
+ {
+ float v = value.array[0];
+
+ if (v < twoExp63Float && v >= -twoExp63Float)
+ {
+ return cast(long) v;
+ }
+
+ return 0x80000000_00000000;
+ }
+ else
+ {
+ version (LDC_Or_GNU)
+ {
+ mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_cvttss2si64;});
+
+ return __builtin_ia32_cvttss2si64(value);
+ }
+ else version (D_InlineAsm_X86_64)
+ {
+ enum ubyte REX_W = 0b0100_1000;
+
+ asm @trusted pure nothrow @nogc
+ {
+ /* RCX is value. */
+ naked;
+ /* DMD refuses to encode `cvttss2si RAX, [RCX]`, so we'll encode it by hand. */
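+ /* In the encoding below, 0xF3 selects the scalar-single form, REX.W widens
+ the destination to RAX, 0x0F 0x2C is the truncating opcode (0x2D is the
+ rounding form used by _mm_cvtss_si64x above), and the ModRM byte
+ 0b00_000_001 encodes the operands RAX, [RCX]. */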
+ db 0xF3, REX_W, 0x0F, 0x2C, 0b00_000_001; /* cvttss2si RAX, [RCX] */
+ ret;
+ }
+ }
+ }
+ }
+
+ @safe pure nothrow @nogc unittest
+ {
+ static bool test()
+ {
+ assert(_mm_cvttss_si64x([0.0f, 0.0f, 1.0f, 2.0f]) == 0);
+ assert(_mm_cvttss_si64x([1.0f, 0.0f, 1.0f, 2.0f]) == 1);
+ assert(_mm_cvttss_si64x([1.5f, 0.0f, 1.0f, 2.0f]) == 1);
+ assert(_mm_cvttss_si64x([2.5f, 0.0f, 1.0f, 2.0f]) == 2);
+ assert(_mm_cvttss_si64x([3.5f, 0.0f, 1.0f, 2.0f]) == 3);
+ assert(_mm_cvttss_si64x([4.5f, 0.0f, 1.0f, 2.0f]) == 4);
+ assert(_mm_cvttss_si64x([4.51f, 0.0f, 1.0f, 2.0f]) == 4);
+ assert(_mm_cvttss_si64x([5.49f, 0.0f, 1.0f, 2.0f]) == 5);
+ assert(_mm_cvttss_si64x([33554432.0f, 0.0f, 1.0f, 2.0f]) == 33554432);
+ assert(_mm_cvttss_si64x([-33554432.0f, 0.0f, 1.0f, 2.0f]) == -33554432);
+ assert(_mm_cvttss_si64x([justUnderTwoExp63Float, 0.0f, 1.0f, 2.0f]) == 9223371487098961920);
+ assert(_mm_cvttss_si64x([-twoExp63Float, 0.0f, 1.0f, 2.0f]) == long.min);
+ assert(_mm_cvttss_si64x([twoExp63Float, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvttss_si64x([float.nan, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvttss_si64x([-float.nan, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvttss_si64x([float.infinity, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+ assert(_mm_cvttss_si64x([-float.infinity, 0.0f, 1.0f, 2.0f]) == 0x80000000_00000000);
+
+ return true;
+ }
+
+ assert(test());
+ static assert(test());
+ }
+ }
+
+ static if (canPassVectors)
+ {
+ version (X86_64_Or_X86)
+ {
+ extern(C)
+ pragma(inline, true)
+ int4 _mm_extract_si64(int4 Source, int4 Descriptor) @safe pure nothrow @nogc
+ {
+ if (__ctfe)
+ {
+ return ctfeExtrq(Source, Descriptor);
+ }
+ else
+ {
+ version (LDC)
+ {
+ static if (__traits(targetHasFeature, "sse4a"))
+ {
+ import ldc.gccbuiltins_x86 : __builtin_ia32_extrq;
+ return cast(int4) __builtin_ia32_extrq(cast(long2) Source, cast(byte16) Descriptor);
+ }
+ else
+ {
+ int4 result;
+
+ asm @trusted pure nothrow @nogc
+ {
+ "extrq %2, %1" : "=x" (result) : "0" (Source), "x" (Descriptor);
+ }
+
+ return result;
+ }
+ }
+ else version (GNU)
+ {
+ static if (__traits(compiles, () {import gcc.builtins : __builtin_ia32_extrq;}))
+ {
+ import gcc.builtins : __builtin_ia32_extrq;
+ return cast(int4) __builtin_ia32_extrq(cast(long2) Source, cast(ubyte16) Descriptor);
+ }
+ else
+ {
+ int4 result;
+
+ asm @trusted pure nothrow @nogc
+ {
+ "extrq %2, %1" : "=x" (result) : "0" (Source), "x" (Descriptor);
+ }
+
+ return result;
+ }
+ }
+ else version (D_InlineAsm_X86_64)
+ {
+ /* __simd can't encode extrq properly. :( */
+ asm @trusted pure nothrow @nogc
+ {
+ /* RCX is Source; RDX is Descriptor. */
+ naked;
+ movdqa XMM0, [RCX];
+ movdqa XMM1, [RDX];
+ /* DMD doesn't know the extrq instruction, so we encode it by hand. 
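+ (0x66 0x0F 0x79 is the EXTRQ opcode, and the ModRM byte 0b11_000_001
+ selects the register operands XMM0 and XMM1.)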
*/ + db 0x66, 0x0F, 0x79, 0b11_000_001; /* extrq XMM0, XMM1 */ + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool t(int source, int layout, int4 expected) + { + int4 s; + s.array[0] = source; + int4 l; + l.array[0] = layout; + + return _mm_extract_si64(s, l).array == expected.array; + } + + static bool test() + { + assert(t(0b00001011_11100101, 4 | (12 << 8), [0, 0, 0, 0])); + assert(t(0b00001011_11100101, 0 | (12 << 8), [0, 0, 0, 0])); + assert(t(0b00001011_11100101, 8 | (0 << 8), [0b11100101, 0, 0, 0])); + assert(t(0b00001011_11100101, 0 | (0 << 8), [0b00001011_11100101, 0, 0, 0])); + assert(t(0b00001011_11100101, 0 | (4 << 8), [0b0000_10111110, 0, 0, 0])); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int4 _mm_extracti_si64(int4 Source, int Length, int Index) @safe pure nothrow @nogc + { + int4 layout; + layout.array[0] = (Length & 0xFF) | ((Index << 8) & 0xFF00); + + return _mm_extract_si64(Source, layout); + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool test() + { + alias extrq = _mm_extracti_si64; + + assert(extrq([0b00001011_11100101, 0, 0, 0], 4, 12).array == [0, 0, 0, 0]); + assert(extrq([0b00001011_11100101, 0, 0, 0], 0, 12).array == [0, 0, 0, 0]); + assert(extrq([0b00001011_11100101, 0, 0, 0], 8, 0).array == [0b11100101, 0, 0, 0]); + assert(extrq([0b00001011_11100101, 0, 0, 0], 0, 0).array == [0b00001011_11100101, 0, 0, 0]); + assert(extrq([0b00001011_11100101, 0, 0, 0], 0, 4).array == [0b0000_10111110, 0, 0, 0]); + + return true; + } + + assert(test()); + static assert(test()); + } + + pragma(inline, true) + private int4 ctfeExtrq()(int4 source, int4 bitLayout) @safe pure nothrow @nogc + { + ulong lowQuad = ulong(cast(uint) source.array[0]) | (ulong(cast(uint) source.array[1]) << 32); + + uint layout = bitLayout.array[0]; + ubyte bitCount = layout & 63; + ubyte bitIndex = (layout >>> 8) & 63; + ulong mask = bitCount == 0 ? ulong.max : (ulong(1) << bitCount) - 1; + + ulong extracted = (lowQuad >>> bitIndex) & (mask); + int4 result; + result.array[0] = cast(uint) extracted; + result.array[1] = cast(uint) (extracted >>> 32); + + return result; + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + long2 longParts; + longParts.array[0] = 0x9ABAFFF1B15C4933; + longParts.array[1] = 0x2488781C67F75A1C; + int4 intParts = cast(int4) longParts; + + static int4 layout(ubyte bitCount, ubyte bitIndex) + { + return (uint(bitIndex) << 8) | bitCount; + } + + foreach (ubyte index; 0 .. (1 << 6)) + { + foreach (ubyte count; 0 .. 
(1 << 6)) + { + int4 bitLayout = layout(count, index); + int4 result = _mm_extract_si64(intParts, bitLayout); + + assert(result.array == _mm_extracti_si64(intParts, count, index).array); + assert(result.array == ctfeExtrq(intParts, bitLayout).array); + } + } + + assert(_mm_extract_si64(intParts, layout(64, 64)).array == ctfeExtrq(intParts, layout(64, 64)).array); + assert(_mm_extract_si64(intParts, layout(65, 65)).array == ctfeExtrq(intParts, layout(65, 65)).array); + } + + extern(C) + pragma(inline, true) + int4 _mm_insert_si64(int4 Source1, int4 Source2) @safe pure nothrow @nogc + { + if (__ctfe) + { + return ctfeInsertq(Source1, Source2); + } + else + { + version (LDC) + { + static if (__traits(targetHasFeature, "sse4a")) + { + import ldc.gccbuiltins_x86 : __builtin_ia32_insertq; + return cast(int4) __builtin_ia32_insertq(cast(long2) Source1, cast(long2) Source2); + } + else + { + int4 result; + + asm @trusted pure nothrow @nogc + { + "insertq %2, %1" : "=x" (result) : "0" (Source1), "x" (Source2); + } + + return result; + } + } + else version (GNU) + { + static if (__traits(compiles, () {import gcc.builtins : __builtin_ia32_insertq;})) + { + import gcc.builtins : __builtin_ia32_insertq; + return cast(int4) __builtin_ia32_insertq(cast(long2) Source1, cast(long2) Source2); + } + else + { + int4 result; + + asm @trusted pure nothrow @nogc + { + "insertq %2, %1" : "=x" (result) : "0" (Source1), "x" (Source2); + } + + return result; + } + } + else version (D_InlineAsm_X86_64) + { + /* __simd can't encode insertq properly. :( */ + asm @trusted pure nothrow @nogc + { + /* RCX is Source1; RDX is Source2. */ + naked; + movdqa XMM0, [RCX]; + movdqa XMM1, [RDX]; + /* DMD doesn't know the insertq instruction, so we encode it by hand. */ + db 0xF2, 0x0F, 0x79, 0b11_000_001; /* insertq XMM0, XMM1 */ + ret; + } + } + } + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool t(int destination, int source, int layout, int4 expected) + { + int4 d; + d.array[0] = destination; + int4 s; + s.array[0] = source; + s.array[2] = layout; + + return _mm_insert_si64(d, s).array == expected.array; + } + + static bool test() + { + assert(t(0b0101, 0b11010, 4 | (12 << 8), [0b10100000_00000101, 0, 0, 0])); + assert(t(0b0101, 0b11010, 0 | (12 << 8), [0b1_10100000_00000101, 0, 0, 0])); + assert(t(0b0101, 0b11010, 2 | (0 << 8), [0b0110, 0, 0, 0])); + assert(t(0b0101, 0b11010, 0 | (0 << 8), [0b11010, 0, 0, 0])); + assert(t(0b0101, 0b11010, 3 | (2 << 8), [0b01001, 0, 0, 0])); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + int4 _mm_inserti_si64(int4 Source1, int4 Source2, int Length, int Index) @safe pure nothrow @nogc + { + int4 layout = Source2; + layout.array[2] = (Length & 0xFF) | ((Index << 8) & 0xFF00); + + return _mm_insert_si64(Source1, layout); + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool test() + { + alias insertq = _mm_inserti_si64; + + assert(insertq([0b0101, 0, 0, 0], [0b11010], 4, 12).array == [0b10100000_00000101, 0, 0, 0]); + assert(insertq([0b0101, 0, 0, 0], [0b11010], 0, 12).array == [0b1_10100000_00000101, 0, 0, 0]); + assert(insertq([0b0101, 0, 0, 0], [0b11010], 2, 0).array == [0b0110, 0, 0, 0]); + assert(insertq([0b0101, 0, 0, 0], [0b11010], 0, 0).array == [0b11010, 0, 0, 0]); + assert(insertq([0b0101, 0, 0, 0], [0b11010], 3, 2).array == [0b01001, 0, 0, 0]); + + return true; + } + + 
assert(test()); + static assert(test()); + } + + pragma(inline, true) + private int4 ctfeInsertq()(int4 destination, int4 source) @safe pure nothrow @nogc + { + uint layout = source.array[2]; + ubyte bitCount = layout & 63; + ubyte bitIndex = (layout >>> 8) & 63; + ulong mask = bitCount == 0 ? ulong.max : (ulong(1) << bitCount) - 1; + + ulong destinationLo = cast(uint) destination.array[0] | (ulong(cast(uint) destination.array[1]) << 32); + ulong sourceLo = cast(uint) source.array[0] | (ulong(cast(uint) source.array[1]) << 32); + + ulong inserted = (destinationLo & ~(mask << bitIndex)) | ((sourceLo & mask) << bitIndex); + int4 result; + result.array[0] = cast(uint) inserted; + result.array[1] = cast(uint) (inserted >>> 32); + + return result; + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + long2 longDestination; + longDestination.array[0] = 0x9ABAFFF1B15C4933; + longDestination.array[1] = 0x2488781C67F75A1C; + int4 intDestination = cast(int4) longDestination; + + long2 longSource; + longSource.array[0] = 0x76D41814E48AE48A; + longSource.array[1] = 0xC221DB7BB89ACBC2; + int4 intSource = cast(int4) longSource; + + static uint layout(ubyte bitCount, ubyte bitIndex) + { + return (uint(bitIndex) << 8) | bitCount; + } + + foreach (ubyte index; 0 .. (1 << 6)) + { + foreach (ubyte count; 0 .. (1 << 6)) + { + int4 source = intSource; + source.array[2] = layout(count, index); + + int4 result = _mm_insert_si64(intDestination, source); + + assert(result.array == _mm_inserti_si64(intDestination, intSource, count, index).array); + assert(result.array == ctfeInsertq(intDestination, source).array); + } + } + + int4 source = intSource; + + source.array[2] = layout(64, 64); + assert(_mm_insert_si64(intDestination, source).array == ctfeInsertq(intDestination, source).array); + source.array[2] = layout(65, 65); + assert(_mm_insert_si64(intDestination, source).array == ctfeInsertq(intDestination, source).array); + } + + extern(C) + pragma(inline, true) + void _mm_stream_sd(scope double* Dest, double2 Source) @safe pure nothrow @nogc + { + if (__ctfe) + { + *Dest = Source.array[0]; + } + else + { + version (LDC) + { + static if (__traits(targetHasFeature, "sse4a")) + { + import ldc.llvmasm : __irEx_pure; + + __irEx_pure!( + "", + `%lowDouble = extractelement <2 x double> %1, i32 0 + store double %lowDouble, ` ~ llvmIRPtr!"double" ~ ` %0, !nontemporal !0`, + "!0 = !{i32 1}", + void + )(Dest, Source); + } + else + { + asm @trusted pure nothrow @nogc + { + "movntsd %1, %0" : "=m" (*Dest) : "x" (Source); + } + } + } + else version (GNU) + { + static if (__traits(compiles, () {import gcc.builtins : __builtin_ia32_movntsd;})) + { + import gcc.builtins : __builtin_ia32_movntsd; + __builtin_ia32_movntsd(Dest, Source); + } + else + { + asm @trusted pure nothrow @nogc + { + "movntsd %1, %0" : "=m" (*Dest) : "x" (Source); + } + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is Dest; RDX is Source. */ + naked; + movaps XMM0, [RDX]; + /* DMD doesn't know the movntsd instruction, so we encode it by hand. */ + db 0xF2, 0x0F, 0x2B, 0b00_000_001; /* movntsd [RCX], XMM0 */ + ret; + } + } + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
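+ (The test takes the address of a stack local, which @safe permits
+ only under DIP1000's scope checking.)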
*/ + @trusted pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool test() + { + double value = double.nan; + + _mm_stream_sd(&value, [22.0, 31.0]); + assert(value == 22.0); + _mm_stream_sd(&value, [0.0, 31.0]); + assert(value == 0.0); + _mm_stream_sd(&value, [double.nan, 0.0]); + assert(value != value); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + void _mm_stream_ss(scope float* Destination, float4 Source) @safe pure nothrow @nogc + { + if (__ctfe) + { + *Destination = Source.array[0]; + } + else + { + version (LDC) + { + static if (__traits(targetHasFeature, "sse4a")) + { + import ldc.llvmasm : __irEx_pure; + + __irEx_pure!( + "", + `%lowFloat = extractelement <4 x float> %1, i32 0 + store float %lowFloat, ` ~ llvmIRPtr!"float" ~ ` %0, !nontemporal !0`, + "!0 = !{i32 1}", + void + )(Destination, Source); + } + else + { + asm @trusted pure nothrow @nogc + { + "movntss %1, %0" : "=m" (*Destination) : "x" (Source); + } + } + } + else version (GNU) + { + static if (__traits(compiles, () {import gcc.builtins : __builtin_ia32_movntss;})) + { + import gcc.builtins : __builtin_ia32_movntss; + __builtin_ia32_movntss(Destination, Source); + } + else + { + asm @trusted pure nothrow @nogc + { + "movntss %1, %0" : "=m" (*Destination) : "x" (Source); + } + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is Destination; RDX is Source. */ + naked; + movaps XMM0, [RDX]; + /* DMD doesn't know the movntss instruction, so we encode it by hand. */ + db 0xF3, 0x0F, 0x2B, 0b00_000_001; /* movntss [RCX], XMM0 */ + ret; + } + } + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + import core.cpuid : sse4a; + + if (!sse4a) + { + return; + } + + static bool test() + { + float value = float.nan; + + _mm_stream_ss(&value, [22.0f, 31.0f, 4.0f, 5.0f]); + assert(value == 22.0f); + _mm_stream_ss(&value, [0.0f, 31.0f, 4.0f, 5.0f]); + assert(value == 0.0f); + _mm_stream_ss(&value, [float.nan, 0.0f, 4.0f, 5.0f]); + assert(value != value); + + return true; + } + + assert(test()); + static assert(test()); + } + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + void _mm_stream_si64x(scope long* Destination, long Source) @safe pure nothrow @nogc + { + if (__ctfe) + { + *Destination = Source; + } + else + { + version (LDC) + { + import ldc.llvmasm : __irEx_pure; + + __irEx_pure!( + "", + `store i64 %1, ` ~ llvmIRPtr!"i64" ~ ` %0, !nontemporal !0`, + "!0 = !{i32 1}", + void + )(Destination, Source); + } + else version (GNU) + { + import gcc.builtins : __builtin_ia32_movnti64; + __builtin_ia32_movnti64(Destination, Source); + } + else version (D_InlineAsm_X86_64) + { + enum ubyte REX_W = 0b0100_1000; + + asm @trusted pure nothrow @nogc + { + /* RCX is Destination; RDX is Source. */ + naked; + /* DMD refuses to encode `movnti [RCX], RDX`, so we'll encode it by hand. */ + db REX_W, 0x0F, 0xC3, 0b00_010_001; /* movnti [RCX], RDX */ + ret; + } + } + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + long value = long.max; + + _mm_stream_si64x(&value, 0); + assert(value == 0); + _mm_stream_si64x(&value, 23); + assert(value == 23); + + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (X86_64) + { + /* This is trusted so that it's @safe without DIP1000 enabled. 
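+ (It takes the address of a local to reinterpret the assembled bits
+ as a float, which plain @safe rejects.) As a worked example of the
+ rounding below: 33554438 is 2^25 + 6, so neighbouring floats are 4
+ apart; the two excess bits, 0b10, are exactly half that gap and the
+ lowest kept bit is 1, so the value rounds up to 33554440, as the
+ unittest checks.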
*/ + private float ctfeX86RoundLongToFloat()(long value) @trusted pure nothrow @nogc + { + import core.bitop : bsr; + + if (value == 0) + { + return 0.0f; + } + else + { + long sign = value >> 63; + /* If the value is negative, we negate it. */ + ulong unsignedValue = (value ^ sign) - sign; + uint exponent = bsr(unsignedValue); + + if (exponent < 24) + { + /* A float can represent this integer exactly, so we just cast the thing. */ + return cast(float) value; + } + else + { + /* Beyond exponents of 24-and-more, power-of-two-sized gaps begin to form between the integers + that a float can represent, and we want to round any integers that fall within those gaps. + + We'll call `exponent - 23` the excess. + The gap between each integer is `1 << excess`, which means that we round the integers + based on the value of their least-significant n-bits, where n is excess. + When those bits are less-than half of the gap-size, we'll round down to the previous + multiple of the gap-size; when those bits are greater-than half of the gap-size, we'll + round up to the next multiple of the gap-size; otherwise, if those bits are exactly + half of the gap-size, the direction we round in depends on the value of the nth-bit of the + integer, where again n is excess: if that bit is 1, we round up, otherwise we round down. */ + + uint excess = exponent - 23; + ulong gapBetweenIntegers = ulong(1) << excess; + ulong halfwayBetweenGap = ulong(1) << (excess - 1); + ulong excessMask = gapBetweenIntegers - 1; + ulong excessBits = unsignedValue & excessMask; + ulong base = ulong(1) << exponent; + + bool roundUp = excessBits > (halfwayBetweenGap - ((unsignedValue & gapBetweenIntegers) != 0)); + + ulong rounded = ((unsignedValue - base) + (ulong(1) << (excess * roundUp)) - 1) & ~excessMask; + bool shouldGoUpAnExponent = rounded == base; + + uint asInt = cast(uint) sign << 31; + asInt |= (exponent + shouldGoUpAnExponent + 127) << 23; + asInt |= (cast(uint) (rounded >>> excess)) * !shouldGoUpAnExponent; + + return *(cast(const(float)*) &asInt); + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(ctfeX86RoundLongToFloat(0) is 0.0f); + assert(ctfeX86RoundLongToFloat(1) is 1.0f); + assert(ctfeX86RoundLongToFloat(2) is 2.0f); + assert(ctfeX86RoundLongToFloat(-1) is -1.0f); + assert(ctfeX86RoundLongToFloat(-2) is -2.0f); + assert(ctfeX86RoundLongToFloat(9223371761976868864) is twoExp63Float); + assert(ctfeX86RoundLongToFloat(9223371761976868863) is justUnderTwoExp63Float); + assert(ctfeX86RoundLongToFloat(9223371487098961920) is justUnderTwoExp63Float); + assert(ctfeX86RoundLongToFloat(-9223371761976868864) is -twoExp63Float); + assert(ctfeX86RoundLongToFloat(-9223371761976868863) is -justUnderTwoExp63Float); + assert(ctfeX86RoundLongToFloat(-9223371487098961920) is -justUnderTwoExp63Float); + assert(ctfeX86RoundLongToFloat(33554434) is 33554432.0f); + assert(ctfeX86RoundLongToFloat(-33554434) is -33554432.0f); + assert(ctfeX86RoundLongToFloat(33554438) is 33554440.0f); + assert(ctfeX86RoundLongToFloat(-33554438) is -33554440.0f); + + return true; + } + + assert(test()); + static assert(test()); + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + private long ctfeX86RoundFloatToLong()(float value) @trusted pure nothrow @nogc + { + /* For CTFE, we'll assume that the rounding-mode is the default for x86, + which is to round half to the nearest even value. 
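+ Ties therefore go to the even neighbour: 2.5f converts to 2 and 3.5f
+ to 4, as the unittest below checks; NaNs, infinities, and values
+ outside the range of long all yield the x86 "integer indefinite"
+ value, long.min.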
*/ + + if (value < twoExp63Float && value >= -twoExp63Float) + { + enum uint implicitBit = 0b0_00000001_00000000000000000000000; + enum uint significandMask = 0b0_00000001_11111111111111111111111; + enum uint fractionalHalf = 0b0_00000001_00000000000000000000000; + enum uint justUnderFractionalHalf = fractionalHalf - 1; + + int asInt = *(cast(const(int)*) &value); + + byte exponent = cast(byte) ((cast(ubyte) (asInt >>> 23)) - 126); + + if (exponent <= -1) + { + return 0; + } + + uint significand = (asInt & significandMask) | implicitBit; + ulong unsignedResult; + + if (exponent >= 24) + { + /* The value has no fractional-part, so there's no need to round it. */ + unsignedResult = ulong(significand) << (exponent - 24); + } + else + { + /* The value has a fractional-part, so we need to round it. */ + uint fraction = (significand << exponent) & significandMask; + uint whole = significand >>> (24 - exponent); + bool adjustment = fraction > ((whole & 1) ? justUnderFractionalHalf : fractionalHalf); + unsignedResult = whole + adjustment; + } + + long sign = long(asInt >> 31); + + /* If the sign bit is set, we need to negate the result; we can do that branchlessly + by taking advantage of the fact that `sign` is either 0 or -1. + As `(s ^ 0) - 0 == s`, whereas `(s ^ -1) - -1 == -s`. */ + return (unsignedResult ^ sign) - sign; + } + + return long.min; + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(ctfeX86RoundFloatToLong(0.0f) == 0); + assert(ctfeX86RoundFloatToLong(-0.0f) == 0); + assert(ctfeX86RoundFloatToLong(float.nan) == 0x80000000_00000000); + assert(ctfeX86RoundFloatToLong(-float.nan) == 0x80000000_00000000); + assert(ctfeX86RoundFloatToLong(float.infinity) == 0x80000000_00000000); + assert(ctfeX86RoundFloatToLong(-float.infinity) == 0x80000000_00000000); + assert(ctfeX86RoundFloatToLong(1.0f) == 1); + assert(ctfeX86RoundFloatToLong(-1.0f) == -1); + assert(ctfeX86RoundFloatToLong(2.5f) == 2); + assert(ctfeX86RoundFloatToLong(-2.5f) == -2); + assert(ctfeX86RoundFloatToLong(3.5f) == 4); + assert(ctfeX86RoundFloatToLong(-3.5f) == -4); + assert(ctfeX86RoundFloatToLong(3.49f) == 3); + assert(ctfeX86RoundFloatToLong(-3.49f) == -3); + assert(ctfeX86RoundFloatToLong(twoExp63Float) == 0x80000000_00000000); + assert(ctfeX86RoundFloatToLong(-twoExp63Float) == long.min); + assert(ctfeX86RoundFloatToLong(justUnderTwoExp63Float) == 9223371487098961920); + assert(ctfeX86RoundFloatToLong(33554432.0f) == 33554432); + assert(ctfeX86RoundFloatToLong(-33554432.0f) == -33554432); + assert(ctfeX86RoundFloatToLong(33554436.0f) == 33554436); + assert(ctfeX86RoundFloatToLong(-33554436.0f) == -33554436); + + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void __movsb(scope ubyte* Destination, const(ubyte)* Source, size_t Count) @system pure nothrow @nogc + { + return repMovs(Destination, Source, Count); + } + + extern(C) + pragma(inline, true) + void __movsw(scope ushort* Destination, const(ushort)* Source, size_t Count) @system pure nothrow @nogc + { + return repMovs(Destination, Source, Count); + } + + extern(C) + pragma(inline, true) + void __movsd(scope uint* Destination, const(uint)* Source, size_t Count) @system pure nothrow @nogc + { + return repMovs(Destination, Source, Count); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + void __movsq(scope ulong* Destination, const(ulong)* Source, size_t Count) @system pure nothrow @nogc + { + return repMovs(Destination, 
Source, Count); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + private void repMovs(T)(scope T* destination, scope const(T)* source, size_t length) @system pure nothrow @nogc + { + import core.bitop : bsr; + + if (__ctfe) + { + foreach (index; 0 .. length) + { + destination[index] = source[index]; + } + } + else + { + enum size = T.sizeof.bsr; + + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir_pure; + + enum char suffix = "bwlq"[size]; + enum dataType = ["i8", "i16", "i32", "i64"][size]; + enum ptr = llvmIRPtr!dataType; + enum lengthType = ["i8", "i16", "i32", "i64"][size_t.sizeof.bsr]; + + version (X86) + { + enum indexPrefix = 'e'; + } + else version (X86_64) + { + enum indexPrefix = 'r'; + } + + __ir_pure!( + `call {` ~ ptr ~ `, ` ~ ptr ~ `, ` ~ lengthType ~ `} asm + "rep movs` ~ suffix ~ `", + "=&{` ~ indexPrefix ~ `di},=&{` ~ indexPrefix ~ `si},=&{ecx},0,1,2,~{memory}" + (` ~ ptr ~ ` %0, ` ~ ptr ~ ` %1, ` ~ lengthType ~ ` %2)`, + void + )(destination, source, length); + } + else version (GNU) + { + enum char suffix = "bwlq"[size]; + + asm @system pure nothrow @nogc + { + "rep movs" ~ suffix + : "=D" (destination), "=S" (source), "=c" (length) + : "0" (destination), "1" (source), "2" (length) + : "memory"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + enum char suffix = "bwdq"[size]; + + version (D_InlineAsm_X86_64) + { + mixin( + "asm @system pure nothrow @nogc + { + /* RCX is destination; RDX is source; R8 is length. */ + naked; + mov R9, RDI; /* RDI is non-volatile, so we save it in R9. */ + mov RAX, RSI; /* RSI is non-volatile, so we save it in RAX. */ + mov RDI, RCX; + mov RCX, R8; + mov RSI, RDX; + rep; movs" ~ suffix ~ "; + mov RSI, RAX; + mov RDI, R9; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @system pure nothrow @nogc + { + naked; + mov EAX, EDI; /* EDI is non-volatile, so we save it in EAX. */ + mov EDX, ESI; /* ESI is non-volatile, so we save it in EDX. */ + mov EDI, [ESP + 4]; /* destination. */ + mov ESI, [ESP + 8]; /* source. */ + mov ECX, [ESP + 12]; /* length. */ + rep; movs" ~ suffix ~ "; + mov ESI, EDX; + mov EDI, EAX; + ret; + }" + ); + } + } + } + } + + @safe pure nothrow @nogc unittest + { + static bool test(alias I, alias movs)() + { + I[8] memory = [I.max, I.max - 1, 2, 3, 4, 5, 6, 7]; + + ((d, s) @trusted => movs(d, s, 4))(&memory[3], &memory[2]); + assert(memory == [I.max, I.max - 1, 2, 2, 2, 2, 2, 7]); + + ((d, s) @trusted => movs(d, s, 2))(&memory[0], &memory[6]); + assert(memory == [2, 7, 2, 2, 2, 2, 2, 7]); + + return true; + } + + assert(test!(ubyte, __movsb)); + static assert(test!(ubyte, __movsb)); + assert(test!(ushort, __movsw)); + static assert(test!(ushort, __movsw)); + assert(test!(uint, __movsd)); + static assert(test!(uint, __movsd)); + + version (X86_64) + { + assert(test!(ulong, __movsq)); + static assert(test!(ulong, __movsq)); + } + } + } + + pragma(inline, true) + byte __noop(Args...)(lazy scope Args args) @safe pure nothrow @nogc + { + return 0; + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + uint counter = 0; + + uint evaluatesWithSideEffect() + { + ++counter; + + return 7; + } + + assert(__noop(evaluatesWithSideEffect()) == 0); + assert(counter == 0); + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + void __nop() @safe pure nothrow @nogc + { + /* Why does this exist? 
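+ Probably so that code can force a real one-byte nop into the
+ instruction stream (e.g. as a patchable padding slot), which an
+ empty statement wouldn't guarantee.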
*/
+
+ if (__ctfe)
+ {}
+ else
+ {
+ version (LDC)
+ {
+ import ldc.llvmasm : __ir_pure;
+
+ __ir_pure!(`call void asm "nop", ""()`, void)();
+ }
+ else version (GNU)
+ {
+ asm @trusted pure nothrow @nogc
+ {
+ "nop";
+ }
+ }
+ else version (InlineAsm_X86_64_Or_X86)
+ {
+ asm @trusted pure nothrow @nogc
+ {
+ nop;
+ }
+ }
+ }
+ }
+
+ @safe pure nothrow @nogc unittest
+ {
+ static bool test()
+ {
+ __nop();
+
+ return true;
+ }
+
+ assert(test());
+ static assert(test());
+ }
+
+ version (X86_64_Or_X86)
+ {
+ extern(C)
+ pragma(inline, true)
+ ushort __popcnt16(ushort value) @safe pure nothrow @nogc
+ {
+ return populationCount(value);
+ }
+
+ extern(C)
+ pragma(inline, true)
+ uint __popcnt(uint value) @safe pure nothrow @nogc
+ {
+ return populationCount(value);
+ }
+ }
+
+ version (X86_64)
+ {
+ extern(C)
+ pragma(inline, true)
+ ulong __popcnt64(ulong value) @safe pure nothrow @nogc
+ {
+ return populationCount(value);
+ }
+ }
+
+ version (X86_64_Or_X86)
+ {
+ extern(C)
+ pragma(inline, true)
+ private T populationCount(T)(T value) @safe pure nothrow @nogc
+ {
+ /* The MSVC intrinsics for popcnt always emit the actual popcnt instruction,
+ whereas the LLVM and GCC intrinsics emit the actual instruction only when the target supports it.
+ So, for LDC and GDC, to benefit from constant-folding where possible we check to see
+ if the target supports popcnt before falling back to inline assembly. */
+
+ import core.bitop : popcnt;
+
+ if (__ctfe)
+ {
+ return cast(T) popcnt(value);
+ }
+ else
+ {
+ version (LDC)
+ {
+ static if (__traits(targetHasFeature, "popcnt"))
+ {
+ import ldc.intrinsics : llvm_ctpop;
+ return llvm_ctpop(value);
+ }
+ else
+ {
+ import core.bitop : bsr;
+ import ldc.llvmasm : __ir_pure;
+
+ enum size = T.sizeof.bsr;
+ enum type = ["i8", "i16", "i32", "i64"][size];
+
+ return __ir_pure!(
+ `%count = call ` ~ type ~ ` asm inteldialect
+ "popcnt $0, $1",
+ "=r,r,~{flags}"
+ (` ~ type ~ ` %0)
+ ret ` ~ type ~ ` %count`,
+ T
+ )(value);
+ }
+ }
+ else version (GNU)
+ {
+ /* If we have __builtin_ia32_crc32si, the target has SSE4.2 and thus, almost certainly, popcnt. 
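+ (There is no equally convenient compile-time probe for the popcnt
+ feature itself, so the crc32 builtin stands in as a proxy.)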
*/ + static if (__traits(compiles, () {import gcc.builtins : __builtin_ia32_crc32si;})) + { + static if (T.sizeof <= 4) + { + import gcc.builtins : __builtin_popcount; + return cast(T) __builtin_popcount(value); + } + else + { + import gcc.builtins : __builtin_popcountll; + return __builtin_popcountll(value); + } + } + else + { + T result; + + asm @trusted pure nothrow @nogc + { + "popcnt %1, %0" : "=r" (result) : "rm" (value) : "cc"; + } + + return result; + } + } + else + { + import core.bitop : _popcnt; + return _popcnt(value); + } + } + } + + @safe pure nothrow @nogc unittest + { + import core.cpuid : hasPopcnt; + + if (!hasPopcnt) + { + return; + } + + static bool test() + { + assert(__popcnt16(0b00000000_00000000) == 0); + assert(__popcnt16(0b10000000_00000000) == 1); + assert(__popcnt16(0b10000000_00000010) == 2); + assert(__popcnt16(0b11111111_11111111) == 16); + + assert(__popcnt(0b00000000_00000000_00000000_00000000) == 0); + assert(__popcnt(0b10000000_00000000_00000000_00000000) == 1); + assert(__popcnt(0b10000000_00000000_00000000_00000010) == 2); + assert(__popcnt(0b11111111_11111111_11111111_11111111) == 32); + + version (X86_64) + { + alias popcnt = __popcnt64; + assert(popcnt(0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000) == 0); + assert(popcnt(0b10000000_00000000_00000000_00000000_00000000_00000000_00000000_00000000) == 1); + assert(popcnt(0b10000000_00000000_00000000_00000000_00000000_00000000_00000000_00000010) == 2); + assert(popcnt(0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111) == 64); + } + + return true; + } + + assert(test()); + static assert(test()); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + ulong __rdtsc() @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + return __builtin_ia32_rdtsc(); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + naked; + rdtsc; + shl RDX, 32; + or RAX, RDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + rdtsc; + ret; + } + } + } + + @safe nothrow @nogc unittest + { + foreach (iteration; 0 .. 10_000) + { + ulong before = __rdtsc(); + ulong after = __rdtsc(); + + if (after != before) + { + return; + } + } + + assert(false); + } + + extern(C) + pragma(inline, true) + ulong __rdtscp(scope uint* AUX) @trusted nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __irEx; + + return __irEx!( + "declare {i64, i32} @llvm.x86.rdtscp()", + `%result = call {i64, i32} @llvm.x86.rdtscp() + + %time = extractvalue {i64, i32} %result, 0 + %aux = extractvalue {i64, i32} %result, 1 + + store i32 %aux, ` ~ llvmIRPtr!"i32" ~ ` %0 + + ret i64 %time`, + "", + ulong + )(AUX); + } + else version (GNU) + { + import gcc.builtins : __builtin_ia32_rdtsc, __builtin_ia32_rdtscp; + return __builtin_ia32_rdtscp(AUX); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* RCX is AUX. */ + naked; + mov R8, RCX; /* We save RCX in R8 before rdtscp clobbers ECX. */ + rdtscp; + mov [R8], ECX; + shl RDX, 32; + or RAX, RDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + push EBX; + mov EBX, [ESP + 8]; /* AUX. */ + rdtscp; + mov [EBX], ECX; + pop EBX; + ret; + } + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. */ + @trusted nothrow @nogc unittest + { + uint aux = 0; + + foreach (iteration; 0 .. 
10_000) + { + ulong before = __rdtscp(&aux); + ulong after = __rdtscp(&aux); + + if (after != before) + { + if (aux == 0) + { + /* Is aux not being written to? Or, is it just zero by happenstance? */ + aux = 0xFFFFFFFF; + cast(void) __rdtscp(&aux); + assert(aux != 0xFFFFFFFF); + } + + return; + } + } + + assert(false); + } + + extern(C) + pragma(inline, true) + auto __readcr0() @safe nothrow @nogc + { + return readNumberedRegister!('R', "CR", 0)(); + } + + extern(C) + pragma(inline, true) + auto __readcr2() @safe nothrow @nogc + { + return readNumberedRegister!('R', "CR", 2)(); + } + + extern(C) + pragma(inline, true) + auto __readcr3() @safe nothrow @nogc + { + return readNumberedRegister!('R', "CR", 3)(); + } + + extern(C) + pragma(inline, true) + auto __readcr4() @safe nothrow @nogc + { + return readNumberedRegister!('R', "CR", 4)(); + } + + extern(C) + pragma(inline, true) + auto __readcr8() @safe nothrow @nogc + { + version (X86_64) + { + return readNumberedRegister!('R', "CR", 8)(); + } + else version (X86) + { + /* __readcr8 is available on x86, for some reason, and this is what it does. */ + return readNumberedRegister!('R', "CR", 0, true)(); + } + } + + /* Ideally, we'd define __readdr as a macro that instantiated a template with the register number, + but ImportC can't explicitly instantiate templates, so this'll have to do. :\ */ + extern(C) + pragma(inline, true) + auto __readdr(uint DebugRegister) @safe nothrow @nogc + { + /* Dear optimiser, please optimise this. */ + switch (DebugRegister) + { + static foreach (number; 0 .. 8) + { + case number: + return readNumberedRegister!('E', "DR", number); + } + default: + assert(false, "Invalid DebugRegister supplied to __readdr."); + } + } + + extern(C) + pragma(inline, true) + private auto readNumberedRegister(char x64Size, string prefix, uint number, bool lock = false)() + @safe nothrow @nogc + { + enum char digit = '0' + number; + + version (LDC) + { + import ldc.llvmasm : __ir; + + version (X86_64) + { + alias T = ulong; + enum type = "i64"; + } + else version (X86) + { + alias T = uint; + enum type = "i32"; + } + + return __ir!( + `%result = call ` ~ type ~ ` asm sideeffect inteldialect + "` ~ (lock ? "lock " : "") ~ `mov $0, ` ~ prefix ~ digit ~ `", + "=r" + () + ret ` ~ type ~ ` %result`, + T + )(); + } + else version (GNU) + { + version (X86_64) + { + ulong result; + } + else version (X86) + { + uint result; + } + + asm @trusted nothrow @nogc + { + "" ~ (lock ? "lock " : "") ~ "mov %%" ~ prefix ~ digit ~ ", %0" : "=r" (result); + } + + return result; + } + else version (D_InlineAsm_X86_64) + { + mixin( + "asm @trusted nothrow @nogc + { + naked; + " ~ (lock ? "lock; " : "") ~ "mov " ~ x64Size ~ "AX, " ~ prefix ~ digit ~ "; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted nothrow @nogc + { + naked; + " ~ (lock ? 
"lock; " : "") ~ "mov EAX, " ~ prefix ~ digit ~ "; + ret; + }" + ); + } + } + + version (LDC) + { + version (X86_64) + { + pragma(LDC_intrinsic, "llvm.x86.flags.read.u64") + private ulong readEFLAGS() @safe nothrow @nogc; + } + else version (X86) + { + pragma(LDC_intrinsic, "llvm.x86.flags.read.u32") + private uint readEFLAGS() @safe nothrow @nogc; + } + + extern(C) + pragma(inline, true) + auto __readeflags() @safe nothrow @nogc + { + return readEFLAGS(); + } + } + else + { + extern(C) + pragma(inline, true) + RegisterSized __readeflags() @safe nothrow @nogc + { + version (GNU) + { + version (X86_64) + { + import gcc.builtins : __builtin_ia32_readeflags_u64; + return __builtin_ia32_readeflags_u64(); + } + else version (X86) + { + import gcc.builtins : __builtin_ia32_readeflags_u32; + return __builtin_ia32_readeflags_u32(); + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + naked; + pushfq; + pop RAX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + pushfd; + pop EAX; + ret; + } + } + } + } + + extern(C) + pragma(inline, true) + long __readmsr(int register) @safe nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + return __ir!( + `%halves = call {i32, i32} asm sideeffect inteldialect "rdmsr", "={eax},={edx},{ecx}"(i32 %0) + + %lo32 = extractvalue {i32, i32} %halves, 0 + %hi32 = extractvalue {i32, i32} %halves, 1 + + %lo = zext i32 %lo32 to i64 + %hi = zext i32 %hi32 to i64 + %hi64 = shl i64 %hi, 32 + %result = or i64 %hi64, %lo + + ret i64 %result`, + long + )(register); + } + else version (GNU) + { + uint lo; + uint hi; + + asm @trusted nothrow @nogc + { + "rdmsr" : "=a" (lo), "=d" (hi) : "c" (register); + } + + return (ulong(hi) << 32) | lo; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* ECX is register. */ + naked; + rdmsr; + shl RDX, 32; + or RAX, RDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* register. */ + rdmsr; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ulong __readpmc(uint counter) @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + return __builtin_ia32_rdpmc(counter); + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* ECX is counter. */ + naked; + rdpmc; + shl RDX, 32; + or RAX, RDX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + mov ECX, [ESP + 8]; /* counter. */ + rdpmc; + ret; + } + } + } + + extern(C) + pragma(inline, true) + uint __segmentlimit(uint a) @safe nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + return __ir!( + `%result = call i32 asm sideeffect inteldialect "lsl $0, $1", "=r,r,~{flags}"(i32 %0) + ret i32 %result`, + uint + )(a); + } + else version (GNU) + { + uint result; + + asm @trusted nothrow @nogc + { + "lsl %1, %0" : "=r" (result) : "rm" (a) : "cc"; + } + + return result; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* ECX is a. */ + naked; + lsl EAX, ECX; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + lsl EAX, [ESP + 4]; /* [ESP + 4] is a. 
*/ + ret; + } + } + } + + @safe nothrow @nogc unittest + { + cast(void) __segmentlimit(0); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ulong __shiftleft128(ulong LowPart, ulong HighPart, ubyte Shift) @safe pure nothrow @nogc + { + return funnelShiftLeft(LowPart, HighPart, Shift); + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(__shiftleft128(0x00FEED00DA00CA70, 0xCAFE00BEEF001230, 24) == 0xBEEF00123000FEED); + assert(__shiftleft128(0x00FEED00DA00CA70, 0xCAFE00BEEF001230, 24 + 64) == 0xBEEF00123000FEED); + + return true; + } + + assert(test); + static assert(test); + } + + extern(C) + pragma(inline, true) + ulong __shiftright128(ulong LowPart, ulong HighPart, ubyte Shift) @safe pure nothrow @nogc + { + return funnelShiftRight(LowPart, HighPart, Shift); + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(__shiftright128(0x00FEED00DA00CA70, 0xCAFE00BEEF001230, 24) == 0x00123000FEED00DA); + assert(__shiftright128(0x00FEED00DA00CA70, 0xCAFE00BEEF001230, 24 + 64) == 0x00123000FEED00DA); + + return true; + } + + assert(test); + static assert(test); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + private I funnelShiftLeft(I)(I low, I high, ubyte shiftCount) @safe pure nothrow @nogc + if (__traits(isIntegral, I) && (I.sizeof == 8 || I.sizeof == 4)) + { + enum uint operandBitWidth = I.sizeof << 3; + enum uint shiftMask = operandBitWidth - 1; + + static I shiftViaSoftware(I low, I high, ubyte bitsToShift) + { + alias shift = bitsToShift; + return (high << (shift & shiftMask)) | ((low >> 1) >>> (~shift & shiftMask)); + } + + if (__ctfe) + { + return shiftViaSoftware(low, high, shiftCount); + } + else + { + version (LDC) + { + import ldc.intrinsics : llvm_fshl; + + /* The fshl intrinsic will truncate the shift amount for us, + as per https://llvm.org/docs/LangRef.html#llvm-fshl-intrinsic. */ + return llvm_fshl(high, low, I(shiftCount)); + } + else version (GNU) + { + return shiftViaSoftware(low, high, shiftCount); + } + else + { + static if (I.sizeof == 8) + { + version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is low; RDX is high; R8B is shiftCount. */ + naked; + mov RAX, RDX; + mov R9, RCX; + mov RCX, R8; + shld RAX, R9, CL; + ret; + } + } + else + { + return shiftViaSoftware(low, high, shiftCount); + } + } + else static if (I.sizeof == 4) + { + version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + mov EDX, [ESP + 4]; /* low. */ + mov EAX, [ESP + 8]; /* high. */ + mov ECX, [ESP + 12]; /* shiftCount. */ + shld EAX, EDX, CL; + ret; + } + } + else + { + return shiftViaSoftware(low, high, shiftCount); + } + } + } + } + } + + extern(C) + pragma(inline, true) + private I funnelShiftRight(I)(I low, I high, ubyte shiftCount) @safe pure nothrow @nogc + if (__traits(isIntegral, I) && (I.sizeof == 8 || I.sizeof == 4)) + { + enum uint operandBitWidth = I.sizeof << 3; + enum uint shiftMask = operandBitWidth - 1; + + static I shiftViaSoftware(I low, I high, ubyte shift) + { + return (low >>> (shift & shiftMask)) | ((high << 1) << (~shift & shiftMask)); + } + + if (__ctfe) + { + return shiftViaSoftware(low, high, shiftCount); + } + else + { + version (LDC) + { + import ldc.intrinsics : llvm_fshr; + + /* The fshr intrinsic will truncate the shift amount for us, + as per https://llvm.org/docs/LangRef.html#llvm-fshr-intrinsic. 
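+ fshr concatenates the first operand (high) above the second (low),
+ shifts the pair right, and yields the low half.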
*/ + return llvm_fshr(high, low, I(shiftCount)); + } + else version (GNU) + { + return shiftViaSoftware(low, high, shiftCount); + } + else + { + static if (I.sizeof == 8) + { + version (D_InlineAsm_X86_64) + { + asm @trusted pure nothrow @nogc + { + /* RCX is low; RDX is high; R8B is shiftCount. */ + naked; + mov R9, RDX; + mov RAX, RCX; + mov RCX, R8; + shrd RAX, R9, CL; + ret; + } + } + else + { + return shiftViaSoftware(low, high, shiftCount); + } + } + else static if (I.sizeof == 4) + { + version (D_InlineAsm_X86) + { + asm @trusted pure nothrow @nogc + { + naked; + mov EDX, [ESP + 4]; /* low. */ + mov EAX, [ESP + 8]; /* high. */ + mov ECX, [ESP + 12]; /* shiftCount. */ + shrd EAX, EDX, CL; + ret; + } + } + else + { + return shiftViaSoftware(low, high, shiftCount); + } + } + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void __sidt(scope void* Destination) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + version (X86_64) + { + alias Pointee = ubyte[10]; + } + else version (X86) + { + alias Pointee = ubyte[6]; + } + } + + version (LDC) + { + import ldc.llvmasm : __ir; + + version (X86_64) + { + enum type = "[10 x i8]"; + } + else version (X86) + { + enum type = "[6 x i8]"; + } + + enum ptr = llvmIRPtr!type ~ " elementtype(" ~ type ~ ")"; + + __ir!( + `call void asm sideeffect inteldialect "sidt $0", "=*m"(` ~ ptr ~ ` %0)`, + void + )(cast(Pointee*) Destination); + } + else version (GNU) + { + asm @system nothrow @nogc + { + "sidt %0" : "=m" (*cast(Pointee*) Destination); + } + } + else version (D_InlineAsm_X86_64) + { + asm @system nothrow @nogc + { + /* RCX is Destination. */ + naked; + sidt [RCX]; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @system nothrow @nogc + { + naked; + mov EAX, [ESP + 4]; /* [ESP + 4] is Destination. */ + sidt [EAX]; + ret; + } + } + } + + @safe nothrow @nogc unittest + { + version (X86_64) + { + alias Storage = ubyte[10]; + } + else version (X86) + { + alias Storage = ubyte[6]; + } + + scope Storage destination = 0; + + ((scope ref d) @trusted => __sidt(&d[0]))(destination); + + foreach (value; destination) + { + if (value != 0) + { + return; + } + } + + assert(false); + } + + extern(C) + pragma(inline, true) + void __stosb(scope ubyte* Destination, ubyte Data, size_t Count) @system pure nothrow @nogc + { + repStos(Destination, Data, Count); + } + + extern(C) + pragma(inline, true) + void __stosw(scope ushort* Destination, ushort Data, size_t Count) @system pure nothrow @nogc + { + repStos(Destination, Data, Count); + } + + extern(C) + pragma(inline, true) + void __stosd(scope uint* Destination, uint Data, size_t Count) @system pure nothrow @nogc + { + repStos(Destination, Data, Count); + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + void __stosq(scope ulong* Destination, ulong Data, size_t Count) @system pure nothrow @nogc + { + repStos(Destination, Data, Count); + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + private void repStos(I)(scope I* destination, I data, size_t length) @system pure nothrow @nogc + if (__traits(isIntegral, I)) + { + if (__ctfe) + { + foreach (index; 0 .. 
length)
+                {
+                    destination[index] = data;
+                }
+            }
+            else
+            {
+                import core.bitop : bsr;
+
+                enum size = I.sizeof.bsr;
+
+                version (LDC)
+                {
+                    import ldc.llvmasm : __ir_pure;
+
+                    enum char suffix = "bwlq"[size];
+                    enum type = ["i8", "i16", "i32", "i64"][size];
+
+                    version (X86)
+                    {
+                        enum string lengthType = "i32";
+                        enum a = "eax";
+                        enum c = "ecx";
+                        enum di = "edi";
+                    }
+                    else version (X86_64)
+                    {
+                        enum string lengthType = "i64";
+                        enum a = "rax";
+                        enum c = "rcx";
+                        enum di = "rdi";
+                    }
+
+                    __ir_pure!(
+                        `call {` ~ llvmIRPtr!type ~ `, ` ~ lengthType ~ `} asm
+                        "rep stos` ~ suffix ~ `",
+                        "=&{` ~ di ~ `},=&{` ~ c ~ `},0,{` ~ a ~ `},1,~{memory}"
+                        (` ~ llvmIRPtr!type ~ ` %0, ` ~ type ~ ` %1, ` ~ lengthType ~ ` %2)`,
+                        void
+                    )(
+                        destination,
+                        data,
+                        length
+                    );
+                }
+                else version (GNU)
+                {
+                    enum char suffix = "bwlq"[size];
+
+                    asm pure nothrow @nogc
+                    {
+                        "rep stos" ~ suffix
+                        : "=D" (destination), "=c" (length)
+                        : "0" (destination), "1" (length), "a" (data)
+                        : "memory";
+                    }
+                }
+                else version (InlineAsm_X86_64_Or_X86)
+                {
+                    enum char suffix = "bwdq"[size];
+
+                    version (D_InlineAsm_X86_64)
+                    {
+                        mixin(
+                            "asm pure nothrow @nogc
+                            {
+                                /* RCX is destination; RDX is data; R8 is length. */
+                                naked;
+                                mov R9, RDI; /* RDI is non-volatile, so we save it in R9. */
+                                mov RDI, RCX;
+                                mov RAX, RDX;
+                                mov RCX, R8;
+                                rep; stos" ~ suffix ~ ";
+                                mov RDI, R9;
+                                ret;
+                            }"
+                        );
+                    }
+                    else version (D_InlineAsm_X86)
+                    {
+                        mixin(
+                            "asm pure nothrow @nogc
+                            {
+                                naked;
+                                mov EDX, EDI; /* EDI is non-volatile, so we save it in EDX. */
+                                mov EDI, [ESP + 4]; /* destination. */
+                                mov EAX, [ESP + 8]; /* data. */
+                                mov ECX, [ESP + 12]; /* length. */
+                                rep; stos" ~ suffix ~ ";
+                                mov EDI, EDX;
+                                ret;
+                            }"
+                        );
+                    }
+                }
+            }
+        }
+
+        @safe pure nothrow @nogc unittest
+        {
+            static bool test(alias I, alias stos)()
+            {
+                I[8] memory = [I.max, I.max - 1, 2, 3, 4, 5, 6, 7];
+                ((m) @trusted => stos(m, 8, 4))(&memory[1]);
+
+                assert(memory == [I.max, 8, 8, 8, 8, 5, 6, 7]);
+
+                return true;
+            }
+
+            assert(test!(ubyte, __stosb));
+            static assert(test!(ubyte, __stosb));
+            assert(test!(ushort, __stosw));
+            static assert(test!(ushort, __stosw));
+            assert(test!(uint, __stosd));
+            static assert(test!(uint, __stosd));
+
+            version (X86_64)
+            {
+                assert(test!(ulong, __stosq));
+                static assert(test!(ulong, __stosq));
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_clgi() @safe nothrow @nogc
+        {
+            version (LDC)
+            {
+                import ldc.llvmasm : __ir;
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "clgi", ""()`,
+                    void
+                )();
+            }
+            else version (GNU)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    "clgi";
+                }
+            }
+            else version (InlineAsm_X86_64_Or_X86)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    naked;
+                    /* DMD doesn't know the clgi instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDD; /* clgi */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_invlpga(scope void* Vaddr, int as_id) @system nothrow @nogc
+        {
+            version (LDC)
+            {
+                import ldc.llvmasm : __ir;
+
+                version (X86_64)
+                {
+                    enum a = "rax";
+                }
+                else version (X86)
+                {
+                    enum a = "eax";
+                }
+
+                enum ptr = llvmIRPtr!"i8";
+
+                __ir!(
+                    `call void asm sideeffect inteldialect
+                    "invlpga $0, $1",
+                    "{` ~ a ~ `},{ecx},~{memory}"
+                    (` ~ ptr ~ ` %0, i32 %1)`,
+                    void
+                )(Vaddr, as_id);
+            }
+            else version (GNU)
+            {
+                asm @system nothrow @nogc
+                {
+                    "invlpga %1, %0" : : "a" (Vaddr), "c" (as_id) : "memory";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @system nothrow @nogc
+                {
+                    /* RCX is Vaddr; EDX is as_id.
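+                       invlpga takes the virtual address implicitly in rAX and the address-space identifier in ECX.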
*/
+                    naked;
+                    mov RAX, RCX;
+                    mov ECX, EDX;
+                    /* DMD doesn't know the invlpga instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDF; /* invlpga */
+                    ret;
+                }
+            }
+            else version (D_InlineAsm_X86)
+            {
+                asm @system nothrow @nogc
+                {
+                    naked;
+                    mov EAX, [ESP + 4]; /* Vaddr. */
+                    mov ECX, [ESP + 8]; /* as_id. */
+                    /* DMD doesn't know the invlpga instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDF; /* invlpga */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_skinit(int block_address) @system nothrow @nogc
+        {
+            /* According to AMD's manual, the skinit instruction clobbers every general-purpose register,
+               but the MSVC intrinsic treats it as though it clobbers nothing--the instruction also acts
+               as a jump, so we'll just go with it and pretend it clobbers nothing. */
+
+            version (LDC)
+            {
+                import ldc.llvmasm : __ir;
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "skinit $0", "{eax},~{flags},~{memory}" (i32 %0)`,
+                    void
+                )(block_address);
+            }
+            else version (GNU)
+            {
+                asm @system nothrow @nogc
+                {
+                    "skinit %0" : : "a" (block_address) : "cc", "memory";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @system nothrow @nogc
+                {
+                    /* ECX is block_address. */
+                    naked;
+                    mov EAX, ECX;
+                    /* DMD doesn't know the skinit instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDE; /* skinit */
+                    ret;
+                }
+            }
+            else version (D_InlineAsm_X86)
+            {
+                asm @system nothrow @nogc
+                {
+                    naked;
+                    mov EAX, [ESP + 4]; /* block_address. */
+                    /* DMD doesn't know the skinit instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDE; /* skinit */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_stgi() @safe nothrow @nogc
+        {
+            version (LDC)
+            {
+                import ldc.llvmasm : __ir;
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "stgi", ""()`,
+                    void
+                )();
+            }
+            else version (GNU)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    "stgi";
+                }
+            }
+            else version (InlineAsm_X86_64_Or_X86)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    naked;
+                    /* DMD doesn't know the stgi instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDC; /* stgi */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_vmload(size_t VmcbPhysicalAddress) @system nothrow @nogc
+        {
+            version (LDC)
+            {
+                import core.bitop : bsr;
+                import ldc.llvmasm : __ir;
+
+                version (X86_64)
+                {
+                    enum a = "rax";
+                }
+                else version (X86)
+                {
+                    enum a = "eax";
+                }
+
+                enum type = ["i8", "i16", "i32", "i64"][size_t.sizeof.bsr];
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "vmload $0", "{` ~ a ~ `},~{memory}"(` ~ type ~ `%0)`,
+                    void
+                )(VmcbPhysicalAddress);
+            }
+            else version (GNU)
+            {
+                asm @system nothrow @nogc
+                {
+                    "vmload %0" : : "a" (VmcbPhysicalAddress) : "memory";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @system nothrow @nogc
+                {
+                    /* RCX is VmcbPhysicalAddress. */
+                    naked;
+                    mov RAX, RCX;
+                    /* DMD doesn't know the vmload instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDA; /* vmload */
+                    ret;
+                }
+            }
+            else version (D_InlineAsm_X86)
+            {
+                asm @system nothrow @nogc
+                {
+                    naked;
+                    mov EAX, [ESP + 4]; /* VmcbPhysicalAddress. */
+                    /* DMD doesn't know the vmload instruction, so we encode it by hand.
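+                       The encoding is 0F 01 DA; vmload takes the VMCB's physical address implicitly in rAX (EAX here).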
*/
+                    db 0x0F, 0x01, 0xDA; /* vmload */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_vmrun(size_t VmcbPhysicalAddress) @system nothrow @nogc
+        {
+            version (LDC)
+            {
+                import core.bitop : bsr;
+                import ldc.llvmasm : __ir;
+
+                version (X86_64)
+                {
+                    enum a = "rax";
+                }
+                else version (X86)
+                {
+                    enum a = "eax";
+                }
+
+                enum type = ["i8", "i16", "i32", "i64"][size_t.sizeof.bsr];
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "vmrun $0", "{` ~ a ~ `},~{memory}"(` ~ type ~ `%0)`,
+                    void
+                )(VmcbPhysicalAddress);
+            }
+            else version (GNU)
+            {
+                asm @system nothrow @nogc
+                {
+                    "vmrun %0" : : "a" (VmcbPhysicalAddress) : "memory";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @system nothrow @nogc
+                {
+                    /* RCX is VmcbPhysicalAddress. */
+                    naked;
+                    mov RAX, RCX;
+                    /* DMD doesn't know the vmrun instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xD8; /* vmrun */
+                    ret;
+                }
+            }
+            else version (D_InlineAsm_X86)
+            {
+                asm @system nothrow @nogc
+                {
+                    naked;
+                    mov EAX, [ESP + 4]; /* VmcbPhysicalAddress. */
+                    /* DMD doesn't know the vmrun instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xD8; /* vmrun */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __svm_vmsave(size_t VmcbPhysicalAddress) @system nothrow @nogc
+        {
+            version (LDC)
+            {
+                import core.bitop : bsr;
+                import ldc.llvmasm : __ir;
+
+                version (X86_64)
+                {
+                    enum a = "rax";
+                }
+                else version (X86)
+                {
+                    enum a = "eax";
+                }
+
+                enum type = ["i8", "i16", "i32", "i64"][size_t.sizeof.bsr];
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "vmsave $0", "{` ~ a ~ `},~{memory}"(` ~ type ~ `%0)`,
+                    void
+                )(VmcbPhysicalAddress);
+            }
+            else version (GNU)
+            {
+                asm @system nothrow @nogc
+                {
+                    "vmsave %0" : : "a" (VmcbPhysicalAddress) : "memory";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @system nothrow @nogc
+                {
+                    /* RCX is VmcbPhysicalAddress. */
+                    naked;
+                    mov RAX, RCX;
+                    /* DMD doesn't know the vmsave instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDB; /* vmsave */
+                    ret;
+                }
+            }
+            else version (D_InlineAsm_X86)
+            {
+                asm @system nothrow @nogc
+                {
+                    naked;
+                    mov EAX, [ESP + 4]; /* VmcbPhysicalAddress. */
+                    /* DMD doesn't know the vmsave instruction, so we encode it by hand. */
+                    db 0x0F, 0x01, 0xDB; /* vmsave */
+                    ret;
+                }
+            }
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __ud2() @safe pure nothrow @nogc
+        {
+            if (__ctfe)
+            {
+                assert(false, "__ud2");
+            }
+            else
+            {
+                version (LDC)
+                {
+                    import ldc.llvmasm : __ir_pure;
+
+                    __ir_pure!(`call void asm sideeffect "ud2", ""()`, void)();
+                }
+                else version (GNU)
+                {
+                    asm @trusted pure nothrow @nogc
+                    {
+                        "ud2";
+                    }
+                }
+                else version (InlineAsm_X86_64_Or_X86)
+                {
+                    asm @trusted pure nothrow @nogc
+                    {
+                        ud2;
+                    }
+                }
+            }
+        }
+
+        @safe pure nothrow @nogc
+        {
+            static assert(__traits(compiles, __ud2()));
+
+            static assert(
+                !__traits(
+                    compiles,
+                    ()
+                    {
+                        enum value = ()
+                        {
+                            __ud2();
+                            return 6;
+                        }();
+                    }()
+                )
+            );
+        }
+
+        extern(C)
+        pragma(inline, true)
+        void __vmx_off() @safe nothrow @nogc
+        {
+            version (LDC)
+            {
+                import ldc.llvmasm : __ir;
+
+                __ir!(
+                    `call void asm sideeffect inteldialect "vmxoff", "~{flags}" ()`,
+                    void
+                )();
+            }
+            else version (GNU)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    "vmxoff" : : : "cc";
+                }
+            }
+            else version (D_InlineAsm_X86_64)
+            {
+                asm @trusted nothrow @nogc
+                {
+                    naked;
+                    /* DMD doesn't know the vmxoff instruction, so we encode it by hand.
*/ + db 0x0F, 0x01, 0xC4; /* vmxoff */ + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + /* DMD doesn't know the vmxoff instruction, so we encode it by hand. */ + db 0x0F, 0x01, 0xC4; /* vmxoff */ + ret; + } + } + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte __vmx_on(scope ulong* VmxonRegionPhysicalAddress) @system nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i64" ~ " elementtype(i64)"; + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmxon $2", + "={@ccc},={@ccz},=*m,~{memory},~{flags}" + (` ~ ptr ~ ` %0) + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %1 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %2`, + void + )(VmxonRegionPhysicalAddress, &carryFlag, &zeroFlag); + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @system nothrow @nogc + { + "vmxon %2" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : "m" (*VmxonRegionPhysicalAddress) + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @system nothrow @nogc + { + /* RCX is VmxonRegionPhysicalAddress. */ + naked; + /* DMD doesn't know the vmxon instruction, so we encode it by hand. */ + db 0xF3, 0x0F, 0xC7, 0b00_110_001; /* vmxon [RCX] */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ubyte __vmx_vmclear(scope ulong* VmcsPhysicalAddress) @system nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i64" ~ " elementtype(i64)"; + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmclear $2", + "={@ccc},={@ccz},=*m,~{memory},~{flags}" + (` ~ ptr ~ ` %0) + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %1 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %2`, + void + )(VmcsPhysicalAddress, &carryFlag, &zeroFlag); + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @system nothrow @nogc + { + "vmclear %2" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : "m" (*VmcsPhysicalAddress) + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @system nothrow @nogc + { + /* RCX is VmcsPhysicalAddress. */ + naked; + /* DMD doesn't know the vmclear instruction, so we encode it by hand. */ + db 0x66, 0x0F, 0xC7, 0b00_110_001; /* vmclear [RCX] */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. 
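+                       This follows the status convention shared by the __vmx_* intrinsics:
+                       CF set means VMfailInvalid (2), ZF set means VMfailValid (1),
+                       and both clear means success (0).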
*/ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ubyte __vmx_vmlaunch() @trusted nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmlaunch", + "={@ccc},={@ccz},~{memory},~{flags}" + () + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %0 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %1`, + void + )(&carryFlag, &zeroFlag); + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @trusted nothrow @nogc + { + "vmlaunch" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + naked; + /* DMD doesn't know the vmlaunch instruction, so we encode it by hand. */ + db 0x0F, 0x01, 0xC2; /* vmlaunch */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ubyte __vmx_vmptrld(scope ulong* VmcsPhysicalAddress) @system nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i64" ~ " elementtype(i64)"; + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmptrld $2", + "={@ccc},={@ccz},=*m,~{memory},~{flags}" + (` ~ ptr ~ ` %0) + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %1 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %2`, + void + )(VmcsPhysicalAddress, &carryFlag, &zeroFlag); + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @system nothrow @nogc + { + "vmptrld %2" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : "m" (*VmcsPhysicalAddress) + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @system nothrow @nogc + { + /* RCX is VmcsPhysicalAddress. */ + naked; + /* DMD doesn't know the vmptrld instruction, so we encode it by hand. */ + db 0x0F, 0xC7, 0b00_110_001; /* vmptrld [RCX] */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void __vmx_vmptrst(scope ulong* VmcsPhysicalAddress) @system nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i64" ~ " elementtype(i64)"; + + __ir!( + `call void asm sideeffect inteldialect "vmptrst $0", "=*m,~{memory},~{flags}"(` ~ ptr ~ ` %0)`, + void + )(VmcsPhysicalAddress); + } + else version (GNU) + { + asm @system nothrow @nogc + { + "vmptrst %0" : "=m" (*VmcsPhysicalAddress) : : "memory", "cc"; + } + } + else version (D_InlineAsm_X86_64) + { + asm @system nothrow @nogc + { + /* RCX is VmcsPhysicalAddress. */ + naked; + /* DMD doesn't know the vmptrst instruction, so we encode it by hand. */ + db 0x0F, 0xC7, 0b00_111_001; /* vmptrst [RCX] */ + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @system nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; + /* DMD doesn't know the vmptrst instruction, so we encode it by hand. 
*/ + db 0x0F, 0xC7, 0b00_111_001; /* vmptrst [ECX] */ + ret; + } + } + } + } + + version (X86_64) + { + extern(C) + pragma(inline, true) + ubyte __vmx_vmread(size_t Field, scope size_t* FieldValue) @trusted nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum ptr = llvmIRPtr!"i64"; + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + ulong readField; + + __ir!( + `%flags = call {i8, i8, i64} asm sideeffect inteldialect + "vmread $2, $3", + "={@ccc},={@ccz},=r,r,~{flags}" + (i64 %0) + + %carryFlag = extractvalue {i8, i8, i64} %flags, 0 + %zeroFlag = extractvalue {i8, i8, i64} %flags, 1 + %field = extractvalue {i8, i8, i64} %flags, 2 + + store i64 %field, ` ~ ptr ~ ` %1 + store i8 %carryFlag, ` ~ bytePtr ~ ` %2 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %3`, + void + )(Field, &readField, &carryFlag, &zeroFlag); + + *FieldValue = readField; + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @trusted nothrow @nogc + { + "vmread %3, %2" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag), "=rm" (*FieldValue) + : "r" (Field) + : "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* RCX is Field; RDX is FieldValue. */ + naked; + /* DMD doesn't know the vmread instruction, so we encode it by hand. */ + db 0x0F, 0x78, 0b00_001_010; /* vmread [RDX], RCX */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ubyte __vmx_vmresume() @trusted nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmresume", + "={@ccc},={@ccz},~{memory},~{flags}" + () + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %0 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %1`, + void + )(&carryFlag, &zeroFlag); + + return carryFlag ? 2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @trusted nothrow @nogc + { + "vmresume" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + naked; + /* DMD doesn't know the vmresume instruction, so we encode it by hand. */ + db 0x0F, 0x01, 0xC3; /* vmresume */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + + extern(C) + pragma(inline, true) + ubyte __vmx_vmwrite(size_t Field, size_t FieldValue) @trusted nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + enum bytePtr = llvmIRPtr!"i8"; + + ubyte carryFlag; + ubyte zeroFlag; + + __ir!( + `%flags = call {i8, i8} asm sideeffect inteldialect + "vmwrite $2, $3", + "={@ccc},={@ccz},r,r,~{memory},~{flags}" + (i64 %0, i64 %1) + + %carryFlag = extractvalue {i8, i8} %flags, 0 + %zeroFlag = extractvalue {i8, i8} %flags, 1 + + store i8 %carryFlag, ` ~ bytePtr ~ ` %2 + store i8 %zeroFlag, ` ~ bytePtr ~ ` %3`, + void + )(Field, FieldValue, &carryFlag, &zeroFlag); + + return carryFlag ? 
2 : zeroFlag; + } + else version (GNU) + { + ubyte carryFlag; + ubyte zeroFlag; + + asm @trusted nothrow @nogc + { + "vmwrite %3, %2" + : "=@ccc" (carryFlag), "=@ccz" (zeroFlag) + : "r" (Field), "rm" (FieldValue) + : "memory", "cc"; + } + + return carryFlag ? 2 : zeroFlag; + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* RCX is Field; RDX is FieldValue. */ + naked; + /* DMD doesn't know the vmwrite instruction, so we encode it by hand. */ + db 0x0F, 0x79, 0b11_001_010; /* vmwrite RCX, RDX */ + /* If the carry-flag is set, we return 2, otherwise we return the zero-flag. */ + setz AL; + mov EDX, 2; + cmovc EAX, EDX; + ret; + } + } + } + } + + version (X86_64_Or_X86) + { + extern(C) + pragma(inline, true) + void __wbinvd() @safe nothrow @nogc + { + version (LDC_Or_GNU) + { + __builtin_ia32_wbinvd(); + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted nothrow @nogc + { + naked; + wbinvd; + ret; + } + } + } + + extern(C) + pragma(inline, true) + auto __writecr0(RegisterSized Data) @safe nothrow @nogc + { + return writeNumberedRegister!('R', "CR", 0)(Data); + } + + extern(C) + pragma(inline, true) + auto __writecr2(RegisterSized Data) @safe nothrow @nogc + { + return writeNumberedRegister!('R', "CR", 2)(Data); + } + + extern(C) + pragma(inline, true) + auto __writecr3(RegisterSized Data) @safe nothrow @nogc + { + return writeNumberedRegister!('R', "CR", 3)(Data); + } + + extern(C) + pragma(inline, true) + auto __writecr4(RegisterSized Data) @safe nothrow @nogc + { + return writeNumberedRegister!('R', "CR", 4)(Data); + } + + extern(C) + pragma(inline, true) + auto __writecr8(RegisterSized Data) @safe nothrow @nogc + { + version (X86_64) + { + return writeNumberedRegister!('R', "CR", 8)(Data); + } + else version (X86) + { + /* __writecr8 is available on x86, for some reason, and this is what it does. */ + return writeNumberedRegister!('R', "CR", 0, true)(Data); + } + } + + /* Ideally, we'd define __writedr as a macro that instantiated a template with the register number, + but ImportC can't explicitly instantiate templates, so this'll have to do. :\ */ + extern(C) + pragma(inline, true) + auto __writedr(uint DebugRegister, RegisterSized DebugValue) @safe nothrow @nogc + { + /* Dear optimiser, please optimise this. */ + switch (DebugRegister) + { + static foreach (number; 0 .. 8) + { + case number: + return writeNumberedRegister!('E', "DR", number)(DebugValue); + } + default: + assert(false, "Invalid DebugRegister supplied to __writedr."); + } + } + + extern(C) + pragma(inline, true) + private void writeNumberedRegister(char x64Size, string prefix, uint number, bool lock = false, T)(T Data) + @safe nothrow @nogc + { + enum char digit = '0' + number; + + version (LDC) + { + import core.bitop : bsr; + import ldc.llvmasm : __ir; + + enum size = T.sizeof.bsr; + enum type = ["i8", "i16", "i32", "i64"][size]; + + return __ir!( + `call void asm sideeffect inteldialect + "` ~ (lock ? "lock " : "") ~ `mov ` ~ prefix ~ digit ~ `, $0", + "r" + (` ~ type ~ ` %0)`, + void + )(Data); + } + else version (GNU) + { + asm @trusted nothrow @nogc + { + "" ~ (lock ? "lock " : "") ~ "mov %0, %%" ~ prefix ~ digit : : "r" (Data); + } + } + else version (D_InlineAsm_X86_64) + { + mixin( + "asm @trusted nothrow @nogc + { + naked; + " ~ (lock ? "lock; " : "") ~ "mov " ~ prefix ~ digit ~ ", " ~ x64Size ~ "AX; + ret; + }" + ); + } + else version (D_InlineAsm_X86) + { + mixin( + "asm @trusted nothrow @nogc + { + naked; + " ~ (lock ? 
"lock; " : "") ~ "mov " ~ prefix ~ digit ~ ", EAX; + ret; + }" + ); + } + } + + extern(C) + pragma(inline, true) + void __writeeflags(RegisterSized Value) @system nothrow @nogc + { + version (LDC_Or_GNU) + { + version (X86_64) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_writeeflags_u64;}); + __builtin_ia32_writeeflags_u64(Value); + } + else version (X86) + { + mixin(q{import }, gccBuiltins, q{ : __builtin_ia32_writeeflags_u32;}); + __builtin_ia32_writeeflags_u32(Value); + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* RCX is Value. */ + naked; + push RCX; + popfq; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + mov EAX, [ESP + 4]; /* Value. */ + push EAX; + popfd; + ret; + } + } + } + + /* This test relies on the compiler inlining the calls to __readeflags/__writeeflags, + which we can't rely on always, hence the `version (none)`. */ + version (none) + { + @safe nothrow @nogc unittest + { + enum RegisterSized mask = 0b110111010101; + + const originalFlags = __readeflags(); + + (() @trusted => __writeeflags(originalFlags | 0b101))(); + assert((__readeflags() & mask) == ((originalFlags | 0b101) & mask)); + + version (GNU) + { + asm @trusted nothrow @nogc + { + "cmp %%eax, %%eax" : : : "cc"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted nothrow @nogc + { + cmp EAX, EAX; + } + } + + enum zeroFlag = 0b1000000; + assert(__readeflags() & zeroFlag); + } + } + + extern(C) + pragma(inline, true) + void __writemsr(uint Register, ulong Value) @safe nothrow @nogc + { + version (LDC) + { + import ldc.llvmasm : __ir; + + return __ir!( + `call void asm sideeffect inteldialect "wrmsr", "{eax},{edx},{ecx}"(i32 %0, i32 %1, i32 %2)`, + void + )(cast(uint) Value, cast(uint) (Value >>> 32), Register); + } + else version (GNU) + { + asm @trusted nothrow @nogc + { + "wrmsr" : : "a" (cast(uint) Value), "d" (cast(uint) (Value >>> 32)), "c" (Register); + } + } + else version (D_InlineAsm_X86_64) + { + asm @trusted nothrow @nogc + { + /* ECX is Register; RDX is Value. */ + naked; + mov RAX, RDX; + shr RDX, 32; + wrmsr; + ret; + } + } + else version (D_InlineAsm_X86) + { + asm @trusted nothrow @nogc + { + naked; + mov ECX, [ESP + 4]; /* Register. */ + mov EAX, [ESP + 8]; /* Low half of Value. */ + mov EDX, [ESP + 12]; /* High half of Value. 
*/ + wrmsr; + ret; + } + } + } + } + + extern(C) + pragma(inline, true) + void _ReadBarrier() @safe pure nothrow @nogc + { + readWriteBarrier(); + } + + extern(C) + pragma(inline, true) + void _WriteBarrier() @safe pure nothrow @nogc + { + readWriteBarrier(); + } + + extern(C) + pragma(inline, true) + void _ReadWriteBarrier() @safe pure nothrow @nogc + { + readWriteBarrier(); + } + + extern(C) + pragma(inline, true) + private void readWriteBarrier() @safe pure nothrow @nogc + { + if (__ctfe) + {} + else + { + version (LDC) + { + import ldc.llvmasm : __ir_pure; + + __ir_pure!(`fence syncscope("singlethread") seq_cst`, void)(); + } + else version (GNU) + { + asm @trusted pure nothrow @nogc + { + "" : : : "memory"; + } + } + else version (InlineAsm_X86_64_Or_X86) + { + asm @trusted pure nothrow @nogc + {} + } + else + { + static assert(false); + } + } + } + + @safe pure nothrow unittest + { + static bool test() + { + version (D_BetterC) + { + _ReadBarrier(); + _WriteBarrier(); + _ReadWriteBarrier(); + } + else + { + uint[] data = new uint[32]; + + data[16] = 1; + _ReadBarrier(); + data[16] = 2; + _WriteBarrier(); + data[16] = 3; + _ReadWriteBarrier(); + data[16] = 4; + /* In the resulting binary, all these writes should actually occur. */ + + assert(data[16] == 4); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + ubyte _BitScanForward(scope uint* Index, uint Mask) @safe pure nothrow @nogc + { + import core.bitop : bsf; + return bitScan!bsf(Index, Mask); + } + + extern(C) + pragma(inline, true) + ubyte _BitScanReverse(scope uint* Index, uint Mask) @safe pure nothrow @nogc + { + import core.bitop : bsr; + return bitScan!bsr(Index, Mask); + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ubyte _BitScanForward64(scope uint* Index, ulong Mask) @safe pure nothrow @nogc + { + import core.bitop : bsf; + return bitScan!bsf(Index, Mask); + } + extern(C) + pragma(inline, true) + ubyte _BitScanReverse64(scope uint* Index, ulong Mask) @safe pure nothrow @nogc + { + import core.bitop : bsr; + return bitScan!bsr(Index, Mask); + } + } + + /* This is trusted so that it's @safe without DIP1000 enabled. 
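+       Without DIP1000's scope checking, taking the address of a stack variable
+       (as these tests do with &index) is not permitted in @safe code.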
*/ + @trusted pure nothrow @nogc unittest + { + static bool test() + { + uint index; + + assert(_BitScanReverse(&index, 0) == 0); + assert(_BitScanReverse(&index, 0b101) == 1); + assert(index == 2); + assert(_BitScanReverse(&index, uint.max) == 1); + assert(index == 31); + + assert(_BitScanForward(&index, 0) == 0); + assert(_BitScanForward(&index, 0b101) == 1); + assert(index == 0); + assert(_BitScanForward(&index, 0b100) == 1); + assert(index == 2); + assert(_BitScanForward(&index, uint.max) == 1); + assert(index == 0); + + version (X86_64_Or_AArch64) + { + assert(_BitScanReverse64(&index, 0) == 0); + assert(_BitScanReverse64(&index, 0b101) == 1); + assert(index == 2); + assert(_BitScanReverse64(&index, ulong.max) == 1); + assert(index == 63); + + assert(_BitScanForward64(&index, 0) == 0); + assert(_BitScanForward64(&index, 0b101) == 1); + assert(index == 0); + assert(_BitScanForward64(&index, 0b100) == 1); + assert(index == 2); + assert(_BitScanForward64(&index, ulong.max) == 1); + assert(index == 0); + } + + return true; + } + + assert(test()); + static assert(test()); + } + + extern(C) + pragma(inline, true) + private ubyte bitScan(alias scan, I)(scope uint* index, I mask) @safe pure nothrow @nogc + { + if (__ctfe) + { + if (mask == 0) + { + return 0; + } + else + { + *index = scan(mask); + return 1; + } + } + else + { + *index = scan(mask); + return mask != 0; + } + } + + extern(C) + pragma(inline, true) + ubyte _bittest(scope const(int)* a, int b) pure nothrow @nogc + { + return bitTestOperation(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandcomplement(scope int* a, int b) pure nothrow @nogc + { + return bitTestOperation!"^="(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandreset(scope int* a, int b) pure nothrow @nogc + { + return bitTestOperation!"&= ~"(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandset(scope int* a, int b) pure nothrow @nogc + { + return bitTestOperation!"|="(a, b); + } + + version (X86_64_Or_AArch64) + { + extern(C) + pragma(inline, true) + ubyte _bittest64(scope const(long)* a, long b) pure nothrow @nogc + { + return bitTestOperation(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandcomplement64(scope long* a, long b) pure nothrow @nogc + { + return bitTestOperation!"^="(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandreset64(scope long* a, long b) pure nothrow @nogc + { + return bitTestOperation!"&= ~"(a, b); + } + + extern(C) + pragma(inline, true) + ubyte _bittestandset64(scope long* a, long b) pure nothrow @nogc + { + return bitTestOperation!"|="(a, b); + } + } + + @system pure nothrow @nogc unittest + { + enum ulong datumA = 0b0111111110100010110111000101011101001111001100111111101100010100; + enum ulong datumB = 0b0001001000011101110011000010011010101000101000111001000001101110; + enum ulong datumC = 0b1010010101000100010111111111000100001000010010111000100111100110; + enum ulong datumD = 0b1011110000010110101001111110000110000011001100101010111100011101; + + static bool test(alias bt, alias btc, alias btr, alias bts, T)() + { + scope T[4] data = [cast(T) datumA, cast(T) datumB, cast(T) datumC, cast(T) datumD]; + + assert(btr(&data[0], T(0)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + assert(bts(&data[0], T(0)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010101); + assert(btc(&data[0], T(0)) == 1); + assert(data[0] == cast(T) 
0b0111111110100010110111000101011101001111001100111111101100010100); + assert(bt(cast(const(T)*) &data[0], T(0)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + + assert(btr(&data[0], T(2)) == 1); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010000); + assert(btc(&data[0], T(2)) == 0); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + assert(bts(&data[0], T(2)) == 1); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + assert(bt(cast(const(T)*) &data[0], T(2)) == 1); + assert(data[0] == cast(T) 0b0111111110100010110111000101011101001111001100111111101100010100); + + assert(btr(&data[0], cast(T) ((T.sizeof << 3) * 3)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + assert(bts(&data[0], cast(T) ((T.sizeof << 3) * 3)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011101); + assert(btc(&data[0], cast(T) ((T.sizeof << 3) * 3)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + assert(bt(cast(const(T)*) &data[0], cast(T) ((T.sizeof << 3) * 3)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + + assert(btr(&data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011100); + assert(btc(&data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 0); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011110); + assert(bts(&data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011110); + assert(bt(cast(const(T)*) &data[0], cast(T) ((T.sizeof << 3) * 3 + 1)) == 1); + assert(data[3] == cast(T) 0b1011110000010110101001111110000110000011001100101010111100011110); + + return true; + } + + assert(test!(_bittest, _bittestandcomplement, _bittestandreset, _bittestandset, int)()); + static assert(test!(_bittest, _bittestandcomplement, _bittestandreset, _bittestandset, int)()); + + version (X86_64_Or_AArch64) + { + assert(test!(_bittest64, _bittestandcomplement64, _bittestandreset64, _bittestandset64, long)()); + static assert(test!(_bittest64, _bittestandcomplement64, _bittestandreset64, _bittestandset64, long)()); + } + } + + extern(C) + pragma(inline, true) + private ubyte bitTestOperation(string binaryOp = null, I)(scope I* integers, I bitIndex) + if (__traits(isIntegral, I)) + { + import core.bitop : bsr, popcnt; + + enum uint bitCount = I.sizeof << 3; + enum uint bitShift = bitCount.bsr; + enum I bitMask = bitCount - 1; + + scope I* integer = integers + (bitIndex >> bitShift); + const I mask = I(1) << (bitIndex & bitMask); + const I bitTest = *integer & mask; + + static if (binaryOp !is null) + { + mixin(q{*integer }, binaryOp, q{mask;}); + } + + return bitTest != 0; + } + + extern(C) + pragma(inline, true) + ulong _byteswap_uint64(ulong val) @safe pure nothrow @nogc + { + import core.bitop : bswap; + return bswap(val); + } + + extern(C) + pragma(inline, true) + uint _byteswap_ulong(uint val) @safe pure nothrow @nogc + { + import core.bitop : bswap; + return bswap(val); + } + + extern(C) + pragma(inline, true) + ushort _byteswap_ushort(ushort val) @safe pure nothrow @nogc + { + /* core.bitop.byteswap doesn't 
work for BetterC. */ + return cast(ushort) (((val >> 8) & 0xFF) | ((val << 8) & 0xFF00u)); + } + + @safe pure nothrow @nogc unittest + { + assert(_byteswap_uint64(0x01234567_89ABCDEF) == 0xEFCDAB89_67452301); + static assert(_byteswap_uint64(0x01234567_89ABCDEF) == 0xEFCDAB89_67452301); + assert(_byteswap_ulong(0x01234567) == 0x67452301); + static assert(_byteswap_ulong(0x01234567) == 0x67452301); + assert(_byteswap_ushort(0x0123) == 0x2301); + static assert(_byteswap_ushort(0x0123) == 0x2301); + } + + extern(C) + pragma(inline, true) + uint _lrotr(uint value, int shift) @safe pure nothrow @nogc + { + shift &= 31; + return cast(uint) ((value >> shift) | (value << (32 - shift))); + } + + extern(C) + pragma(inline, true) + uint _lrotl(uint value, int shift) @safe pure nothrow @nogc + { + shift &= 31; + return cast(uint) ((value << shift) | (value >> (32 - shift))); + } + + extern(C) + pragma(inline, true) + uint _rotr(uint value, int shift) @safe pure nothrow @nogc + { + shift &= 31; + return cast(uint) ((value >> shift) | (value << (32 - shift))); + } + + extern(C) + pragma(inline, true) + uint _rotl(uint value, int shift) @safe pure nothrow @nogc + { + shift &= 31; + return cast(uint) ((value << shift) | (value >> (32 - shift))); + } + + extern(C) + pragma(inline, true) + ulong _rotr64(ulong value, int shift) @safe pure nothrow @nogc + { + shift &= 63; + return cast(ulong) ((value >> shift) | (value << (64 - shift))); + } + + extern(C) + pragma(inline, true) + ulong _rotl64(ulong value, int shift) @safe pure nothrow @nogc + { + shift &= 63; + return cast(ulong) ((value << shift) | (value >> (64 - shift))); + } + + extern(C) + pragma(inline, true) + ushort _rotr16(ushort value, ubyte shift) @safe pure nothrow @nogc + { + shift &= 15; + return cast(ushort) ((value >> shift) | (value << (16 - shift))); + } + + extern(C) + pragma(inline, true) + ushort _rotl16(ushort value, ubyte shift) @safe pure nothrow @nogc + { + shift &= 15; + return cast(ushort) ((value << shift) | (value >> (16 - shift))); + } + + extern(C) + pragma(inline, true) + ubyte _rotr8(ubyte value, ubyte shift) @safe pure nothrow @nogc + { + shift &= 7; + return cast(ubyte) ((value >> shift) | (value << (8 - shift))); + } + + extern(C) + pragma(inline, true) + ubyte _rotl8(ubyte value, ubyte shift) @safe pure nothrow @nogc + { + shift &= 7; + return cast(ubyte) ((value << shift) | (value >> (8 - shift))); + } + + @safe pure nothrow @nogc unittest + { + static bool test() + { + assert(_lrotr(0x12345678, 8) == 0x78123456); + assert(_lrotr(0x12345678, 8 + 32) == 0x78123456); + assert(_lrotr(0x12345678, 16) == 0x56781234); + + assert(_lrotl(0x12345678, 8) == 0x34567812); + assert(_lrotl(0x12345678, 8 + 32) == 0x34567812); + assert(_lrotl(0x12345678, 16) == 0x56781234); + + assert(_rotr(0x12345678, 8) == 0x78123456); + assert(_rotr(0x12345678, 8 + 32) == 0x78123456); + assert(_rotr(0x12345678, 16) == 0x56781234); + + assert(_rotl(0x12345678, 8) == 0x34567812); + assert(_rotl(0x12345678, 8 + 32) == 0x34567812); + assert(_rotl(0x12345678, 16) == 0x56781234); + + assert(_rotr64(0x12345678_9ABCDEF0, 8) == 0xF012345678_9ABCDE); + assert(_rotr64(0x12345678_9ABCDEF0, 8 + 64) == 0xF0123456_789ABCDE); + assert(_rotr64(0x12345678_9ABCDEF0, 16) == 0xDEF01234_56789ABC); + + assert(_rotl64(0x12345678_9ABCDEF0, 8) == 0x345678_9ABCDEF012); + assert(_rotl64(0x12345678_9ABCDEF0, 8 + 64) == 0x345678_9ABCDEF012); + assert(_rotl64(0x12345678_9ABCDEF0, 16) == 0x56789ABC_DEF01234); + + assert(_rotr16(0x1234, 4) == 0x4123); + assert(_rotr16(0x1234, 4 + 
16) == 0x4123);
+            assert(_rotr16(0x1234, 8) == 0x3412);
+
+            assert(_rotl16(0x1234, 4) == 0x2341);
+            assert(_rotl16(0x1234, 4 + 16) == 0x2341);
+            assert(_rotl16(0x1234, 8) == 0x3412);
+
+            assert(_rotr8(0b10010110, 2) == 0b10100101);
+            assert(_rotr8(0b10010110, 2 + 8) == 0b10100101);
+            assert(_rotr8(0b10010110, 4) == 0b01101001);
+
+            assert(_rotl8(0b10010110, 2) == 0b01011010);
+            assert(_rotl8(0b10010110, 2 + 8) == 0b01011010);
+            assert(_rotl8(0b10010110, 4) == 0b01101001);
+
+            return true;
+        }
+
+        assert(test());
+        static assert(test());
+    }
+
+    version (AArch64_Or_ARM)
+    {
+        extern(C)
+        pragma(inline, true)
+        void __yield() @safe pure nothrow @nogc
+        {
+            if (__ctfe)
+            {}
+            else
+            {
+                version (LDC)
+                {
+                    llvm_arm_hint(1);
+                }
+                else version (GNU)
+                {
+                    asm @trusted pure nothrow @nogc
+                    {
+                        "yield" : : : "memory";
+                    }
+                }
+                else
+                {
+                    static assert(false);
+                }
+            }
+        }
+
+        @safe pure nothrow @nogc unittest
+        {
+            static bool test()
+            {
+                __yield();
+                return true;
+            }
+
+            assert(test());
+            static assert(test());
+        }
+    }
+}
diff --git a/druntime/src/include/importc_msvc_builtins.h b/druntime/src/include/importc_msvc_builtins.h
new file mode 100644
index 000000000000..d155d96f3e57
--- /dev/null
+++ b/druntime/src/include/importc_msvc_builtins.h
@@ -0,0 +1,31 @@
+/* This file contains reimplementations of some of the intrinsics recognized
+   by the MSVC compiler, for ImportC.
+   To use it, put `#include <importc_msvc_builtins.h>` in the C source-code that intends to use it.
+
+   This header emits its declarations only when `__IMPORTC_MSVC_BUILTINS__` is not defined.
+   This header defines `__IMPORTC_MSVC_BUILTINS__` when it is included.
+
+   Copyright: Copyright D Language Foundation 2024-2024
+   License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
+   Authors: Harry Gillanders
+   Source: $(DRUNTIMESRC importc_msvc_builtins.h) */
+
+#ifndef __IMPORTC_MSVC_BUILTINS__
+#define __IMPORTC_MSVC_BUILTINS__ 1
+
+__import __builtins_msvc;
+
+#if defined(__clang__)
+#define __assume(expression) __builtin_assume(expression)
+#elif defined(__GNUC__)
+#define __assume(expression) do {if (!(expression)) {__builtin_unreachable();}} while (0)
+#else
+#define __assume(expression) __check(expression)
+#endif
+
+#if defined(_M_ARM64) || defined(_M_ARM)
+#define __dmb __builtin_arm_dmb
+#define __dsb __builtin_arm_dsb
+#define __isb __builtin_arm_isb
+#endif
+#endif
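+
+/* A minimal usage sketch (hypothetical, not part of this PR's test suite; it assumes
+   an x86 or x64 Windows target, compiled with something like `dmd example.c`):
+
+       #include <importc_msvc_builtins.h>
+
+       int lowestSetBit(unsigned int mask)
+       {
+           unsigned int index;
+           // _BitScanForward returns nonzero when a set bit was found and
+           // stores the bit's zero-based position in index; e.g. a mask of
+           // 0x40 yields an index of 6.
+           return _BitScanForward(&index, mask) ? (int) index : -1;
+       }
+*/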