From bad4be6e29e141ec331e25259ab27ca4c20f86a3 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Sat, 28 Oct 2023 15:39:54 +0200 Subject: [PATCH 01/24] interpret: call caller_location logic the same way codegen does, and share some code --- src/common.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/common.rs b/src/common.rs index 9771f44f62cfb..38dba016c23e1 100644 --- a/src/common.rs +++ b/src/common.rs @@ -433,16 +433,7 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> { // Note: must be kept in sync with get_caller_location from cg_ssa pub(crate) fn get_caller_location(&mut self, mut source_info: mir::SourceInfo) -> CValue<'tcx> { let span_to_caller_location = |fx: &mut FunctionCx<'_, '_, 'tcx>, span: Span| { - use rustc_session::RemapFileNameExt; - let topmost = span.ctxt().outer_expn().expansion_cause().unwrap_or(span); - let caller = fx.tcx.sess.source_map().lookup_char_pos(topmost.lo()); - let const_loc = fx.tcx.const_caller_location(( - rustc_span::symbol::Symbol::intern( - &caller.file.name.for_codegen(&fx.tcx.sess).to_string_lossy(), - ), - caller.line as u32, - caller.col_display as u32 + 1, - )); + let const_loc = fx.tcx.span_as_caller_location(span); crate::constant::codegen_const_value(fx, const_loc, fx.tcx.caller_location_ty()) }; From c6f50902940f3160a929e71c42ac631178304734 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Sat, 28 Oct 2023 16:16:15 +0200 Subject: [PATCH 02/24] share the track_caller handling within a mir::Body --- src/common.rs | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/src/common.rs b/src/common.rs index 38dba016c23e1..63562d335089b 100644 --- a/src/common.rs +++ b/src/common.rs @@ -430,38 +430,11 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> { } } - // Note: must be kept in sync with get_caller_location from cg_ssa - pub(crate) fn get_caller_location(&mut self, mut source_info: mir::SourceInfo) -> CValue<'tcx> { - let span_to_caller_location = |fx: &mut FunctionCx<'_, '_, 'tcx>, span: Span| { - let const_loc = fx.tcx.span_as_caller_location(span); - crate::constant::codegen_const_value(fx, const_loc, fx.tcx.caller_location_ty()) - }; - - // Walk up the `SourceScope`s, in case some of them are from MIR inlining. - // If so, the starting `source_info.span` is in the innermost inlined - // function, and will be replaced with outer callsite spans as long - // as the inlined functions were `#[track_caller]`. - loop { - let scope_data = &self.mir.source_scopes[source_info.scope]; - - if let Some((callee, callsite_span)) = scope_data.inlined { - // Stop inside the most nested non-`#[track_caller]` function, - // before ever reaching its caller (which is irrelevant). - if !callee.def.requires_caller_location(self.tcx) { - return span_to_caller_location(self, source_info.span); - } - source_info.span = callsite_span; - } - - // Skip past all of the parents with `inlined: None`. - match scope_data.inlined_parent_scope { - Some(parent) => source_info.scope = parent, - None => break, - } - } - - // No inlined `SourceScope`s, or all of them were `#[track_caller]`. 
-        self.caller_location.unwrap_or_else(|| span_to_caller_location(self, source_info.span))
+    pub(crate) fn get_caller_location(&mut self, source_info: mir::SourceInfo) -> CValue<'tcx> {
+        self.mir.caller_location_span(source_info, self.caller_location, self.tcx, |span| {
+            let const_loc = self.tcx.span_as_caller_location(span);
+            crate::constant::codegen_const_value(self, const_loc, self.tcx.caller_location_ty())
+        })
     }
 
     pub(crate) fn anonymous_str(&mut self, msg: &str) -> Value {

From 41dcb52153cfb2e3a5eb74af0626e6e984e2ddc4 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 29 Oct 2023 20:30:50 +0000
Subject: [PATCH 03/24] Merge commit 'dde58803fd6cbb270c7a437f36a8a3a29fbef679'
 into sync_cg_clif-2023-10-29

---
 Cargo.toml | 4 +-
 patches/stdlib-lock.toml | 17 ---
 rust-toolchain | 2 +-
 scripts/setup_rust_fork.sh | 4 +-
 scripts/test_bootstrap.sh | 4 +-
 src/global_asm.rs | 113 ++++++++++++++------
 src/inline_asm.rs | 214 ++-----------------------------------
 src/intrinsics/cpuid.rs | 74 -------------
 src/intrinsics/mod.rs | 2 -
 9 files changed, 100 insertions(+), 334 deletions(-)
 delete mode 100644 src/intrinsics/cpuid.rs

diff --git a/Cargo.toml b/Cargo.toml
index b5e6f431123e8..30db10f745715 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,9 +35,9 @@ smallvec = "1.8.1"

 [features]
 # Enable features not ready to be enabled when compiling as part of rustc
-unstable-features = ["jit", "inline_asm"]
+unstable-features = ["jit", "inline_asm_sym"]
 jit = ["cranelift-jit", "libloading"]
-inline_asm = []
+inline_asm_sym = []

 [package.metadata.rust-analyzer]
 rustc_private = true

diff --git a/patches/stdlib-lock.toml b/patches/stdlib-lock.toml
index f10b4d6b9d4fa..9902bca8eab27 100644
--- a/patches/stdlib-lock.toml
+++ b/patches/stdlib-lock.toml
@@ -40,22 +40,6 @@ version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9"

-[[package]]
-name = "auxv"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e50430f9beb8effb02399fa81c76eeaa26b05e4f03b09285cad8d079c1af5a3d"
-dependencies = [
- "byteorder",
- "gcc",
-]
-
-[[package]]
-name = "byteorder"
-version = "1.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
-
 [[package]]
 name = "cc"
 version = "1.0.79"
@@ -388,7 +372,6 @@ dependencies = [
 name = "std_detect"
 version = "0.1.5"
 dependencies = [
- "auxv",
 "cfg-if",
 "compiler_builtins",
 "cupid",

diff --git a/rust-toolchain b/rust-toolchain
index 3735ac1c17b14..7e3eaacf8ef08 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2023-10-21"
+channel = "nightly-2023-10-29"
 components = ["rust-src", "rustc-dev", "llvm-tools"]

diff --git a/scripts/setup_rust_fork.sh b/scripts/setup_rust_fork.sh
index 3e48fb006dea4..bbb8a010d965f 100644
--- a/scripts/setup_rust_fork.sh
+++ b/scripts/setup_rust_fork.sh
@@ -4,7 +4,9 @@ set -e
 # Compiletest expects all standard library paths to start with /rustc/FAKE_PREFIX.
 # CG_CLIF_STDLIB_REMAP_PATH_PREFIX will cause cg_clif's build system to pass
 # --remap-path-prefix to handle this.
-CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build
+# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc, as
+# the LLVM backend isn't compiled in here.
+CG_CLIF_FORCE_GNU_AS=1 CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build

 echo "[SETUP] Rust fork"
 git clone https://github.com/rust-lang/rust.git || true

diff --git a/scripts/test_bootstrap.sh b/scripts/test_bootstrap.sh
index 791d457993de3..a8f6d7a202486 100755
--- a/scripts/test_bootstrap.sh
+++ b/scripts/test_bootstrap.sh
@@ -11,5 +11,7 @@ rm -r compiler/rustc_codegen_cranelift/{Cargo.*,src}
 cp ../Cargo.* compiler/rustc_codegen_cranelift/
 cp -r ../src compiler/rustc_codegen_cranelift/src

-./x.py build --stage 1 library/std
+# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc, as
+# the LLVM backend isn't compiled in here.
+CG_CLIF_FORCE_GNU_AS=1 ./x.py build --stage 1 library/std
 popd

diff --git a/src/global_asm.rs b/src/global_asm.rs
index 6692d1b85eac0..b14007f4e5232 100644
--- a/src/global_asm.rs
+++ b/src/global_asm.rs
@@ -46,6 +46,13 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
             global_asm.push_str(&string);
         }
         InlineAsmOperand::SymFn { anon_const } => {
+            if cfg!(not(feature = "inline_asm_sym")) {
+                tcx.sess.span_err(
+                    item.span,
+                    "asm! and global_asm! sym operands are not yet supported",
+                );
+            }
+
             let ty = tcx.typeck_body(anon_const.body).node_type(anon_const.hir_id);
             let instance = match ty.kind() {
                 &ty::FnDef(def_id, args) => Instance::new(def_id, args),
@@ -57,6 +64,13 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
             global_asm.push_str(symbol.name);
         }
         InlineAsmOperand::SymStatic { path: _, def_id } => {
+            if cfg!(not(feature = "inline_asm_sym")) {
+                tcx.sess.span_err(
+                    item.span,
+                    "asm! and global_asm! sym operands are not yet supported",
+                );
+            }
+
             let instance = Instance::mono(tcx, def_id).polymorphize(tcx);
             let symbol = tcx.symbol_name(instance);
             global_asm.push_str(symbol.name);
@@ -81,22 +95,23 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
     }
 }

-pub(crate) fn asm_supported(tcx: TyCtxt<'_>) -> bool {
-    cfg!(feature = "inline_asm") && !tcx.sess.target.is_like_windows
-}
-
 #[derive(Debug)]
 pub(crate) struct GlobalAsmConfig {
-    asm_enabled: bool,
     assembler: PathBuf,
+    target: String,
     pub(crate) output_filenames: Arc<OutputFilenames>,
 }

 impl GlobalAsmConfig {
     pub(crate) fn new(tcx: TyCtxt<'_>) -> Self {
         GlobalAsmConfig {
-            asm_enabled: asm_supported(tcx),
             assembler: crate::toolchain::get_toolchain_binary(tcx.sess, "as"),
+            target: match &tcx.sess.opts.target_triple {
+                rustc_target::spec::TargetTriple::TargetTriple(triple) => triple.clone(),
+                rustc_target::spec::TargetTriple::TargetJson { path_for_rustdoc, .. } => {
+                    path_for_rustdoc.to_str().unwrap().to_owned()
+                }
+            },
             output_filenames: tcx.output_filenames(()).clone(),
         }
     }
@@ -111,21 +126,6 @@ pub(crate) fn compile_global_asm(
         return Ok(None);
     }

-    if !config.asm_enabled {
-        if global_asm.contains("__rust_probestack") {
-            return Ok(None);
-        }
-
-        if cfg!(not(feature = "inline_asm")) {
-            return Err(
-                "asm! and global_asm! support is disabled while compiling rustc_codegen_cranelift"
-                    .to_owned(),
-            );
-        } else {
-            return Err("asm! and global_asm! 
are not yet supported on Windows".to_owned()); - } - } - // Remove all LLVM style comments let mut global_asm = global_asm .lines() @@ -134,20 +134,67 @@ pub(crate) fn compile_global_asm( .join("\n"); global_asm.push('\n'); - let output_object_file = config.output_filenames.temp_path(OutputType::Object, Some(cgu_name)); + let global_asm_object_file = add_file_stem_postfix( + config.output_filenames.temp_path(OutputType::Object, Some(cgu_name)), + ".asm", + ); // Assemble `global_asm` - let global_asm_object_file = add_file_stem_postfix(output_object_file, ".asm"); - let mut child = Command::new(&config.assembler) - .arg("-o") - .arg(&global_asm_object_file) - .stdin(Stdio::piped()) - .spawn() - .expect("Failed to spawn `as`."); - child.stdin.take().unwrap().write_all(global_asm.as_bytes()).unwrap(); - let status = child.wait().expect("Failed to wait for `as`."); - if !status.success() { - return Err(format!("Failed to assemble `{}`", global_asm)); + if option_env!("CG_CLIF_FORCE_GNU_AS").is_some() { + let mut child = Command::new(&config.assembler) + .arg("-o") + .arg(&global_asm_object_file) + .stdin(Stdio::piped()) + .spawn() + .expect("Failed to spawn `as`."); + child.stdin.take().unwrap().write_all(global_asm.as_bytes()).unwrap(); + let status = child.wait().expect("Failed to wait for `as`."); + if !status.success() { + return Err(format!("Failed to assemble `{}`", global_asm)); + } + } else { + let mut child = Command::new(std::env::current_exe().unwrap()) + .arg("--target") + .arg(&config.target) + .arg("--crate-type") + .arg("staticlib") + .arg("--emit") + .arg("obj") + .arg("-o") + .arg(&global_asm_object_file) + .arg("-") + .arg("-Abad_asm_style") + .arg("-Zcodegen-backend=llvm") + .stdin(Stdio::piped()) + .spawn() + .expect("Failed to spawn `as`."); + let mut stdin = child.stdin.take().unwrap(); + stdin + .write_all( + br####" + #![feature(decl_macro, no_core, rustc_attrs)] + #![allow(internal_features)] + #![no_core] + #[rustc_builtin_macro] + #[rustc_macro_transparency = "semitransparent"] + macro global_asm() { /* compiler built-in */ } + global_asm!(r###" + "####, + ) + .unwrap(); + stdin.write_all(global_asm.as_bytes()).unwrap(); + stdin + .write_all( + br####" + "###); + "####, + ) + .unwrap(); + std::mem::drop(stdin); + let status = child.wait().expect("Failed to wait for `as`."); + if !status.success() { + return Err(format!("Failed to assemble `{}`", global_asm)); + } } Ok(Some(global_asm_object_file)) diff --git a/src/inline_asm.rs b/src/inline_asm.rs index 0517c609337b9..331649b2ec24f 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -8,7 +8,6 @@ use rustc_span::sym; use rustc_target::asm::*; use target_lexicon::BinaryFormat; -use crate::global_asm::asm_supported; use crate::prelude::*; enum CInlineAsmOperand<'tcx> { @@ -45,208 +44,11 @@ pub(crate) fn codegen_inline_asm<'tcx>( ) { // FIXME add .eh_frame unwind info directives - if !asm_supported(fx.tcx) { - if template.is_empty() { - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } - - // Used by panic_abort - if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) { - fx.bcx.ins().trap(TrapCode::User(1)); - return; - } - - // Used by stdarch - if template[0] == InlineAsmTemplatePiece::String("mov ".to_string()) - && matches!( - template[1], - InlineAsmTemplatePiece::Placeholder { - operand_idx: 0, - modifier: Some('r'), - span: _ - } - ) - && template[2] == InlineAsmTemplatePiece::String(", rbx".to_string()) - && 
template[3] == InlineAsmTemplatePiece::String("\n".to_string()) - && template[4] == InlineAsmTemplatePiece::String("cpuid".to_string()) - && template[5] == InlineAsmTemplatePiece::String("\n".to_string()) - && template[6] == InlineAsmTemplatePiece::String("xchg ".to_string()) - && matches!( - template[7], - InlineAsmTemplatePiece::Placeholder { - operand_idx: 0, - modifier: Some('r'), - span: _ - } - ) - && template[8] == InlineAsmTemplatePiece::String(", rbx".to_string()) - { - assert_eq!(operands.len(), 4); - let (leaf, eax_place) = match operands[1] { - InlineAsmOperand::InOut { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), - late: _, - ref in_value, - out_place: Some(out_place), - } => ( - crate::base::codegen_operand(fx, in_value).load_scalar(fx), - crate::base::codegen_place(fx, out_place), - ), - _ => unreachable!(), - }; - let ebx_place = match operands[0] { - InlineAsmOperand::Out { - reg: - InlineAsmRegOrRegClass::RegClass(InlineAsmRegClass::X86( - X86InlineAsmRegClass::reg, - )), - late: _, - place: Some(place), - } => crate::base::codegen_place(fx, place), - _ => unreachable!(), - }; - let (sub_leaf, ecx_place) = match operands[2] { - InlineAsmOperand::InOut { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), - late: _, - ref in_value, - out_place: Some(out_place), - } => ( - crate::base::codegen_operand(fx, in_value).load_scalar(fx), - crate::base::codegen_place(fx, out_place), - ), - _ => unreachable!(), - }; - let edx_place = match operands[3] { - InlineAsmOperand::Out { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), - late: _, - place: Some(place), - } => crate::base::codegen_place(fx, place), - _ => unreachable!(), - }; - - let (eax, ebx, ecx, edx) = crate::intrinsics::codegen_cpuid_call(fx, leaf, sub_leaf); - - eax_place.write_cvalue(fx, CValue::by_val(eax, fx.layout_of(fx.tcx.types.u32))); - ebx_place.write_cvalue(fx, CValue::by_val(ebx, fx.layout_of(fx.tcx.types.u32))); - ecx_place.write_cvalue(fx, CValue::by_val(ecx, fx.layout_of(fx.tcx.types.u32))); - edx_place.write_cvalue(fx, CValue::by_val(edx, fx.layout_of(fx.tcx.types.u32))); - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } - - // Used by compiler-builtins - if fx.tcx.symbol_name(fx.instance).name.starts_with("___chkstk") { - // ___chkstk, ___chkstk_ms and __alloca are only used on Windows - crate::trap::trap_unimplemented(fx, "Stack probes are not supported"); - return; - } else if fx.tcx.symbol_name(fx.instance).name == "__alloca" { - crate::trap::trap_unimplemented(fx, "Alloca is not supported"); - return; - } - - // Used by core::hint::spin_loop() - if template[0] - == InlineAsmTemplatePiece::String(".insn i 0x0F, 0, x0, x0, 0x010".to_string()) - && template.len() == 1 - { - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } - - // Used by measureme - if template[0] == InlineAsmTemplatePiece::String("xor %eax, %eax".to_string()) - && template[1] == InlineAsmTemplatePiece::String("\n".to_string()) - && template[2] == InlineAsmTemplatePiece::String("mov %rbx, ".to_string()) - && matches!( - template[3], - InlineAsmTemplatePiece::Placeholder { - operand_idx: 0, - modifier: Some('r'), - span: _ - } - ) - && template[4] == InlineAsmTemplatePiece::String("\n".to_string()) - && template[5] == InlineAsmTemplatePiece::String("cpuid".to_string()) - && template[6] == 
InlineAsmTemplatePiece::String("\n".to_string()) - && template[7] == InlineAsmTemplatePiece::String("mov ".to_string()) - && matches!( - template[8], - InlineAsmTemplatePiece::Placeholder { - operand_idx: 0, - modifier: Some('r'), - span: _ - } - ) - && template[9] == InlineAsmTemplatePiece::String(", %rbx".to_string()) - { - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } else if template[0] == InlineAsmTemplatePiece::String("rdpmc".to_string()) { - // Return zero dummy values for all performance counters - match operands[0] { - InlineAsmOperand::In { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), - value: _, - } => {} - _ => unreachable!(), - }; - let lo = match operands[1] { - InlineAsmOperand::Out { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), - late: true, - place: Some(place), - } => crate::base::codegen_place(fx, place), - _ => unreachable!(), - }; - let hi = match operands[2] { - InlineAsmOperand::Out { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), - late: true, - place: Some(place), - } => crate::base::codegen_place(fx, place), - _ => unreachable!(), - }; - - let u32_layout = fx.layout_of(fx.tcx.types.u32); - let zero = fx.bcx.ins().iconst(types::I32, 0); - lo.write_cvalue(fx, CValue::by_val(zero, u32_layout)); - hi.write_cvalue(fx, CValue::by_val(zero, u32_layout)); - - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } else if template[0] == InlineAsmTemplatePiece::String("lock xadd ".to_string()) - && matches!( - template[1], - InlineAsmTemplatePiece::Placeholder { operand_idx: 1, modifier: None, span: _ } - ) - && template[2] == InlineAsmTemplatePiece::String(", (".to_string()) - && matches!( - template[3], - InlineAsmTemplatePiece::Placeholder { operand_idx: 0, modifier: None, span: _ } - ) - && template[4] == InlineAsmTemplatePiece::String(")".to_string()) - { - let destination_block = fx.get_block(destination.unwrap()); - fx.bcx.ins().jump(destination_block, &[]); - return; - } - - if cfg!(not(feature = "inline_asm")) { - fx.tcx.sess.span_err( - span, - "asm! and global_asm! support is disabled while compiling rustc_codegen_cranelift", - ); - } else { - fx.tcx.sess.span_err(span, "asm! and global_asm! are not yet supported on Windows"); - } + // Used by panic_abort on Windows, but uses a syntax which only happens to work with + // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for + // the LLVM backend. + if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) { + fx.bcx.ins().trap(TrapCode::User(1)); return; } @@ -280,6 +82,12 @@ pub(crate) fn codegen_inline_asm<'tcx>( CInlineAsmOperand::Const { value } } InlineAsmOperand::SymFn { ref value } => { + if cfg!(not(feature = "inline_asm_sym")) { + fx.tcx + .sess + .span_err(span, "asm! and global_asm! sym operands are not yet supported"); + } + let const_ = fx.monomorphize(value.const_); if let ty::FnDef(def_id, args) = *const_.ty().kind() { let instance = ty::Instance::resolve_for_fn_ptr( diff --git a/src/intrinsics/cpuid.rs b/src/intrinsics/cpuid.rs deleted file mode 100644 index 5120b89c4e8b0..0000000000000 --- a/src/intrinsics/cpuid.rs +++ /dev/null @@ -1,74 +0,0 @@ -//! Emulation of a subset of the cpuid x86 instruction. - -use crate::prelude::*; - -/// Emulates a subset of the cpuid x86 instruction. 
-/// -/// This emulates an intel cpu with sse and sse2 support, but which doesn't support anything else. -pub(crate) fn codegen_cpuid_call<'tcx>( - fx: &mut FunctionCx<'_, '_, 'tcx>, - leaf: Value, - _sub_leaf: Value, -) -> (Value, Value, Value, Value) { - let leaf_0 = fx.bcx.create_block(); - let leaf_1 = fx.bcx.create_block(); - let leaf_7 = fx.bcx.create_block(); - let leaf_8000_0000 = fx.bcx.create_block(); - let leaf_8000_0001 = fx.bcx.create_block(); - let unsupported_leaf = fx.bcx.create_block(); - - let dest = fx.bcx.create_block(); - let eax = fx.bcx.append_block_param(dest, types::I32); - let ebx = fx.bcx.append_block_param(dest, types::I32); - let ecx = fx.bcx.append_block_param(dest, types::I32); - let edx = fx.bcx.append_block_param(dest, types::I32); - - let mut switch = cranelift_frontend::Switch::new(); - switch.set_entry(0, leaf_0); - switch.set_entry(1, leaf_1); - switch.set_entry(7, leaf_7); - switch.set_entry(0x8000_0000, leaf_8000_0000); - switch.set_entry(0x8000_0001, leaf_8000_0001); - switch.emit(&mut fx.bcx, leaf, unsupported_leaf); - - fx.bcx.switch_to_block(leaf_0); - let max_basic_leaf = fx.bcx.ins().iconst(types::I32, 1); - let vend0 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"Genu"))); - let vend2 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ineI"))); - let vend1 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ntel"))); - fx.bcx.ins().jump(dest, &[max_basic_leaf, vend0, vend1, vend2]); - - fx.bcx.switch_to_block(leaf_1); - let cpu_signature = fx.bcx.ins().iconst(types::I32, 0); - let additional_information = fx.bcx.ins().iconst(types::I32, 0); - let ecx_features = fx.bcx.ins().iconst(types::I32, 0); - let edx_features = fx.bcx.ins().iconst(types::I32, 1 << 25 /* sse */ | 1 << 26 /* sse2 */); - fx.bcx.ins().jump(dest, &[cpu_signature, additional_information, ecx_features, edx_features]); - - fx.bcx.switch_to_block(leaf_7); - // This leaf technically has subleaves, but we just return zero for all subleaves. - let zero = fx.bcx.ins().iconst(types::I32, 0); - fx.bcx.ins().jump(dest, &[zero, zero, zero, zero]); - - fx.bcx.switch_to_block(leaf_8000_0000); - let extended_max_basic_leaf = fx.bcx.ins().iconst(types::I32, 0); - let zero = fx.bcx.ins().iconst(types::I32, 0); - fx.bcx.ins().jump(dest, &[extended_max_basic_leaf, zero, zero, zero]); - - fx.bcx.switch_to_block(leaf_8000_0001); - let zero = fx.bcx.ins().iconst(types::I32, 0); - let proc_info_ecx = fx.bcx.ins().iconst(types::I32, 0); - let proc_info_edx = fx.bcx.ins().iconst(types::I32, 0); - fx.bcx.ins().jump(dest, &[zero, zero, proc_info_ecx, proc_info_edx]); - - fx.bcx.switch_to_block(unsupported_leaf); - crate::trap::trap_unimplemented( - fx, - "__cpuid_count arch intrinsic doesn't yet support specified leaf", - ); - - fx.bcx.switch_to_block(dest); - fx.bcx.ins().nop(); - - (eax, ebx, ecx, edx) -} diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index e94091e6a25eb..83d5d53624eb1 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -12,7 +12,6 @@ macro_rules! 
intrinsic_args { } } -mod cpuid; mod llvm; mod llvm_aarch64; mod llvm_x86; @@ -25,7 +24,6 @@ use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths}; use rustc_middle::ty::GenericArgsRef; use rustc_span::symbol::{kw, sym, Symbol}; -pub(crate) use self::cpuid::codegen_cpuid_call; pub(crate) use self::llvm::codegen_llvm_intrinsic_call; use crate::prelude::*; From 9a33f82140c6da6e5808253309c674554b93e9fe Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Mon, 30 Oct 2023 15:21:34 +0000 Subject: [PATCH 04/24] Remove inline asm support from the list of limitations #1403 extended support to all targets supported by cg_clif --- Readme.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/Readme.md b/Readme.md index 5664cbe7d4fac..1a2b2bbc58812 100644 --- a/Readme.md +++ b/Readme.md @@ -76,8 +76,6 @@ configuration options. ## Not yet supported -* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041)) - * On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`. * SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported) * Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default) From 48ca2d9703742149aa33b3f84ae933d063213d19 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Tue, 31 Oct 2023 18:38:31 +0000 Subject: [PATCH 05/24] Implement llvm.fma.v* intrinsics cc #1405 --- src/intrinsics/llvm.rs | 15 +++++++++++++++ src/intrinsics/mod.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/src/intrinsics/llvm.rs b/src/intrinsics/llvm.rs index c169476099806..e9b7daf14924d 100644 --- a/src/intrinsics/llvm.rs +++ b/src/intrinsics/llvm.rs @@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.fma.v") => { + intrinsic_args!(fx, args => (x,y,z); intrinsic); + + simd_trio_for_each_lane( + fx, + x, + y, + z, + ret, + &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| { + fx.bcx.ins().fma(lane_x, lane_y, lane_z) + }, + ); + } + _ => { fx.tcx .sess diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 83d5d53624eb1..58b31a534328e 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -132,6 +132,35 @@ fn simd_pair_for_each_lane<'tcx>( } } +fn simd_trio_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + z: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx); + let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx); + let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + fn 
simd_reduce<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, val: CValue<'tcx>, From 03c9acdd8fec7039c91c9523e4cfde761fb2a577 Mon Sep 17 00:00:00 2001 From: George Bateman Date: Tue, 15 Aug 2023 20:10:45 +0100 Subject: [PATCH 06/24] Support enum variants in offset_of! --- src/base.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base.rs b/src/base.rs index 80e7c5bd9edbc..91b1547cb6ea6 100644 --- a/src/base.rs +++ b/src/base.rs @@ -766,7 +766,7 @@ fn codegen_stmt<'tcx>( NullOp::SizeOf => layout.size.bytes(), NullOp::AlignOf => layout.align.abi.bytes(), NullOp::OffsetOf(fields) => { - layout.offset_of_subfield(fx, fields.iter().map(|f| f.index())).bytes() + layout.offset_of_subfield(fx, fields.iter()).bytes() } }; let val = CValue::by_val( From 04f1024ecb8716e3f23b580af48d50428b05ad6a Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:44:36 +0000 Subject: [PATCH 07/24] Rustup to rustc 1.75.0-nightly (75b064d26 2023-11-01) --- rust-toolchain | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-toolchain b/rust-toolchain index 7e3eaacf8ef08..54683e1ebb18b 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,3 +1,3 @@ [toolchain] -channel = "nightly-2023-10-29" +channel = "nightly-2023-11-02" components = ["rust-src", "rustc-dev", "llvm-tools"] From c04ceb4342b7362e9f7620f956f1098bdab24128 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:52:21 +0000 Subject: [PATCH 08/24] Fix workaround for the `int $$0x29` issue to not crash on empty inline asm --- src/inline_asm.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/inline_asm.rs b/src/inline_asm.rs index 331649b2ec24f..78212e8a4cc08 100644 --- a/src/inline_asm.rs +++ b/src/inline_asm.rs @@ -47,7 +47,9 @@ pub(crate) fn codegen_inline_asm<'tcx>( // Used by panic_abort on Windows, but uses a syntax which only happens to work with // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for // the LLVM backend. - if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) { + if template.len() == 1 + && template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) + { fx.bcx.ins().trap(TrapCode::User(1)); return; } From ef3703694ff6b19d38aff83477bb2ad38f5a58d8 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:27:56 +0000 Subject: [PATCH 09/24] Disable a couple of rustc tests which are broken due to a rustc bug --- scripts/test_rustc_tests.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/test_rustc_tests.sh b/scripts/test_rustc_tests.sh index b485f2571cc7e..dcc8c7f435605 100755 --- a/scripts/test_rustc_tests.sh +++ b/scripts/test_rustc_tests.sh @@ -147,6 +147,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd +# rustc bugs +# ========== +# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463 +rm tests/ui/coroutine/gen_block_*.rs + cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist # prevent $(RUSTDOC) from picking up the sysroot built by x.py. 
It conflicts with the one used by

From 909513ef74d790b29d9c79eab775b1fca62ecd04 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:32:44 +0000
Subject: [PATCH 10/24] Use Value instead of CValue in CInlineAsmOperand

---
 src/inline_asm.rs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/inline_asm.rs b/src/inline_asm.rs
index 78212e8a4cc08..ee7d5bcc9f7cd 100644
--- a/src/inline_asm.rs
+++ b/src/inline_asm.rs
@@ -13,7 +13,7 @@ use crate::prelude::*;
 enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
-        value: CValue<'tcx>,
+        value: Value,
     },
     Out {
         reg: InlineAsmRegOrRegClass,
@@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> {
     InOut {
         reg: InlineAsmRegOrRegClass,
         _late: bool,
-        in_value: CValue<'tcx>,
+        in_value: Value,
         out_place: Option<CPlace<'tcx>>,
     },
     Const {
@@ -57,9 +57,10 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     let operands = operands
         .into_iter()
         .map(|operand| match *operand {
-            InlineAsmOperand::In { reg, ref value } => {
-                CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) }
-            }
+            InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
+                reg,
+                value: crate::base::codegen_operand(fx, value).load_scalar(fx),
+            },
             InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out {
                 reg,
                 late,
@@ -69,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 CInlineAsmOperand::InOut {
                     reg,
                     _late: late,
-                    in_value: crate::base::codegen_operand(fx, in_value),
+                    in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx),
                     out_place: out_place.map(|place| crate::base::codegen_place(fx, place)),
                 }
             }
@@ -167,7 +168,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     for (i, operand) in operands.iter().enumerate() {
         match operand {
             CInlineAsmOperand::In { reg: _, value } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
             }
             CInlineAsmOperand::Out { reg: _, late: _, place } => {
                 if let Some(place) = place {
@@ -175,7 +176,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 }
             }
             CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value));
                 if let Some(out_place) = out_place {
                     outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place));
                 }

From f6a8c3afb5b2c0cb1be044b9dc4368bfc8403991 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Thu, 2 Nov 2023 16:03:26 +0000
Subject: [PATCH 11/24] Add real implementation of _xgetbv()

---
 src/inline_asm.rs | 80 ++++++++++++++++++++++++++++++++++++++
 src/intrinsics/llvm_x86.rs | 11 ++----
 2 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/src/inline_asm.rs b/src/inline_asm.rs
index ee7d5bcc9f7cd..ce0eecca8a8ba 100644
--- a/src/inline_asm.rs
+++ b/src/inline_asm.rs
@@ -729,3 +729,83 @@ fn call_inline_asm<'tcx>(
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
+
+pub(crate) fn codegen_xgetbv<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    xcr_no: Value,
+    ret: CPlace<'tcx>,
+) {
+    // FIXME add .eh_frame unwind info directives
+
+    let operands = vec![
+        CInlineAsmOperand::In {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+            value: xcr_no,
+        },
+        CInlineAsmOperand::Out {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+            late: true,
+            place: Some(ret),
+        },
+        
CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + late: true, + place: None, + }, + ]; + let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM; + + let mut inputs = Vec::new(); + let mut outputs = Vec::new(); + + let mut asm_gen = InlineAssemblyGenerator { + tcx: fx.tcx, + arch: fx.tcx.sess.asm_arch.unwrap(), + enclosing_def_id: fx.instance.def_id(), + template: &[InlineAsmTemplatePiece::String( + " + xgetbv + // out = rdx << 32 | rax + shl rdx, 32 + or rax, rdx + " + .to_string(), + )], + operands: &operands, + options, + registers: Vec::new(), + stack_slots_clobber: Vec::new(), + stack_slots_input: Vec::new(), + stack_slots_output: Vec::new(), + stack_slot_size: Size::from_bytes(0), + }; + asm_gen.allocate_registers(); + asm_gen.allocate_stack_slots(); + + let inline_asm_index = fx.cx.inline_asm_index.get(); + fx.cx.inline_asm_index.set(inline_asm_index + 1); + let asm_name = format!( + "__inline_asm_{}_n{}", + fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"), + inline_asm_index + ); + + let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); + fx.cx.global_asm.push_str(&generated_asm); + + for (i, operand) in operands.iter().enumerate() { + match operand { + CInlineAsmOperand::In { reg: _, value } => { + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); + } + CInlineAsmOperand::Out { reg: _, late: _, place } => { + if let Some(place) = place { + outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place)); + } + } + _ => unreachable!(), + } + } + + call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); +} diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index ea5997a14bb74..6fd85ab8bf1d5 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -20,16 +20,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( // Used by is_x86_feature_detected!(); "llvm.x86.xgetbv" => { - // FIXME use the actual xgetbv instruction - intrinsic_args!(fx, args => (v); intrinsic); + intrinsic_args!(fx, args => (xcr_no); intrinsic); - let v = v.load_scalar(fx); + let xcr_no = xcr_no.load_scalar(fx); - // As of writing on XCR0 exists - fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached); - - let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */); - ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64))); + crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); } "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { From 8eca01f4b6e09946baa26435894251e4d441b794 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 3 Oct 2023 13:54:17 +1100 Subject: [PATCH 12/24] Remove support for compiler plugins. They've been deprecated for four years. This commit includes the following changes. - It eliminates the `rustc_plugin_impl` crate. - It changes the language used for lints in `compiler/rustc_driver_impl/src/lib.rs` and `compiler/rustc_lint/src/context.rs`. External lints are now called "loaded" lints, rather than "plugins" to avoid confusion with the old plugins. This only has a tiny effect on the output of `-W help`. - E0457 and E0498 are no longer used. - E0463 is narrowed, now only relating to unfound crates, not plugins. - The `plugin` feature was moved from "active" to "removed". - It removes the entire plugins chapter from the unstable book. - It removes quite a few tests, mostly all of those in `tests/ui-fulldeps/plugin/`. Closes #29597. 
---
 scripts/test_rustc_tests.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/test_rustc_tests.sh b/scripts/test_rustc_tests.sh
index b485f2571cc7e..a299b6de6b1cd 100755
--- a/scripts/test_rustc_tests.sh
+++ b/scripts/test_rustc_tests.sh
@@ -68,7 +68,6 @@ rm -r tests/run-make/split-debuginfo # same
 rm -r tests/run-make/symbols-include-type-name # --emit=asm not supported
 rm -r tests/run-make/target-specs # i686 not supported by Cranelift
 rm -r tests/run-make/mismatching-target-triples # same
-rm -r tests/run-make/use-extern-for-plugins # same
 rm tests/ui/asm/x86_64/issue-82869.rs # vector regs in inline asm not yet supported
 rm tests/ui/asm/x86_64/issue-96797.rs # const and sym inline asm operands don't work entirely correctly

From 1f09bae6a8c5fe82587f3bdf6d4d271823b5c0d5 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 4 Nov 2023 19:05:50 +0000
Subject: [PATCH 13/24] Implement min/max neon intrinsics

---
 build_system/tests.rs | 1 +
 config.txt | 1 +
 example/neon.rs | 155 +++++++++++++++++++++++++++++++++
 src/intrinsics/llvm_aarch64.rs | 91 +++++++++++++++----
 src/intrinsics/mod.rs | 30 +++++++
 5 files changed, 259 insertions(+), 19 deletions(-)
 create mode 100644 example/neon.rs

diff --git a/build_system/tests.rs b/build_system/tests.rs
index 1e24d1b113fe6..10736ff9a55c8 100644
--- a/build_system/tests.rs
+++ b/build_system/tests.rs
@@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
     TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
     TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
     TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
+    TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
 ];

 pub(crate) static RAND_REPO: GitRepo = GitRepo::github(

diff --git a/config.txt b/config.txt
index 7ff805e58d967..2ccdc7d78748a 100644
--- a/config.txt
+++ b/config.txt
@@ -42,6 +42,7 @@ aot.float-minmax-pass
 aot.mod_bench
 aot.issue-72793
 aot.issue-59326
+aot.neon

 testsuite.extended_sysroot
 test.rust-random/rand

diff --git a/example/neon.rs b/example/neon.rs
new file mode 100644
index 0000000000000..0e23e862df1fa
--- /dev/null
+++ b/example/neon.rs
@@ -0,0 +1,155 @@
+// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
+
+#![feature(portable_simd)]
+use std::arch::aarch64::*;
+use std::mem::transmute;
+use std::simd::*;
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s8() {
+    let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
+    let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
+    let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s16() {
+    let a = i16x4::from([1, 2, 3, -4]);
+    let b = i16x4::from([0, 3, 2, 5]);
+    let e = i16x4::from([1, -4, 0, 2]);
+    let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s32() {
+    let a = i32x2::from([1, -2]);
+    let b = i32x2::from([0, 3]);
+    let e = i32x2::from([-2, 0]);
+    let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
+    let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
+    let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
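+
+// Note on the expected values: vpmin/vpmax are *pairwise* reductions. Each
+// output lane is the min/max of two adjacent input lanes, with `a` supplying
+// the low half of the result and `b` the high half. A hypothetical
+// walk-through (annotation only, not part of the upstream patch) for
+// test_vpmin_s16 above:
+//   vpmin_s16([1, 2, 3, -4], [0, 3, 2, 5])
+//     == [min(1,2), min(3,-4), min(0,3), min(2,5)]
+//     == [1, -4, 0, 2]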
+ +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([1, 3, 0, 2]); + let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([1, 0]); + let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([-2., 0.]); + let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]); + let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([2, 3, 3, 5]); + let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([1, 3]); + let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]); + let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([2, 4, 3, 5]); + let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([2, 3]); + let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([1., 3.]); + let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +fn main() { + unsafe { + test_vpmin_s8(); + test_vpmin_s16(); + test_vpmin_s32(); + test_vpmin_u8(); + test_vpmin_u16(); + test_vpmin_u32(); + test_vpmin_f32(); + test_vpmax_s8(); + test_vpmax_s16(); + test_vpmax_s32(); + test_vpmax_u8(); + test_vpmax_u16(); + test_vpmax_u32(); + test_vpmax_f32(); + } +} + +#[cfg(target_arch = "x86_64")] +fn main() {} diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index 0c211a06dc4a7..ed318a89fa05f 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -156,6 +156,78 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| 
fx.bcx.ins().umax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane), + ); + } + // FIXME generalize vector types "llvm.aarch64.neon.tbl1.v16i8" => { intrinsic_args!(fx, args => (t, idx); intrinsic); @@ -172,25 +244,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( } } - // FIXME generalize vector types - "llvm.aarch64.neon.umaxp.v16i8" => { - intrinsic_args!(fx, args => (a, b); intrinsic); - - // FIXME add helper for horizontal pairwise operations - for i in 0..8 { - let lane1 = a.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); - } - for i in 0..8 { - let lane1 = b.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted()); - } - } - /* _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v") || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v") diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 58b31a534328e..bfeeb117ff5b3 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -132,6 +132,36 @@ fn simd_pair_for_each_lane<'tcx>( } } +fn simd_horizontal_pair_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let src = if lane_idx < (lane_count / 2) { x } else { y }; + let src_idx = lane_idx % (lane_count / 2); + + let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx); + let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, 
lhs_lane, rhs_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + fn simd_trio_for_each_lane<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, x: CValue<'tcx>, From 88c2e7896b991e503b889da25e5c6a948586b16b Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 4 Nov 2023 19:11:15 +0000 Subject: [PATCH 14/24] Implement aarch64 addp intrinsics --- example/neon.rs | 56 ++++++++++++++++++++++++++++++++++ src/intrinsics/llvm_aarch64.rs | 12 ++++++++ 2 files changed, 68 insertions(+) diff --git a/example/neon.rs b/example/neon.rs index 0e23e862df1fa..6ea053d0b0046 100644 --- a/example/neon.rs +++ b/example/neon.rs @@ -131,6 +131,55 @@ unsafe fn test_vpmax_f32() { assert_eq!(r, e); } +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s16() { + let a = i16x4::from([1, 2, 3, 4]); + let b = i16x4::from([0, -1, -2, -3]); + let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b))); + let e = i16x4::from([3, 7, -1, -5]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s32() { + let a = i32x2::from([1, 2]); + let b = i32x2::from([0, -1]); + let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b))); + let e = i32x2::from([3, -1]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s8() { + let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]); + let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b))); + let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([30, 31, 32, 33]); + let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b))); + let e = u16x4::from([3, 7, 61, 65]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([30, 31]); + let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b))); + let e = u32x2::from([3, 61]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]); + let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]); + assert_eq!(r, e); +} + #[cfg(target_arch = "aarch64")] fn main() { unsafe { @@ -148,6 +197,13 @@ fn main() { test_vpmax_u16(); test_vpmax_u32(); test_vpmax_f32(); + + test_vpadd_s16(); + test_vpadd_s32(); + test_vpadd_s8(); + test_vpadd_u16(); + test_vpadd_u32(); + test_vpadd_u8(); } } diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index ed318a89fa05f..fdad5a474d6d1 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -228,6 +228,18 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( ); } + _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane), + ); + } + // FIXME generalize vector types "llvm.aarch64.neon.tbl1.v16i8" => { intrinsic_args!(fx, args => (t, idx); intrinsic); From 70a6abfd29c20fbaa7bee58259a16a62ed48ad93 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 4 Nov 2023 19:37:36 +0000 Subject: [PATCH 15/24] Add unsigned saturating add/sub intrinsics for aarch64 
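
The lanewise behaviour wired up here clamps to the type bounds instead of
wrapping. A scalar sketch of what each u8 lane must compute (illustration
only, not code from this patch; the values mirror the vqadd_u8/vqsub_u8
tests below):

    fn uqadd_u8(a: u8, b: u8) -> u8 { a.saturating_add(b) } // 0xff + 37 == 0xff
    fn uqsub_u8(a: u8, b: u8) -> u8 { a.saturating_sub(b) } // 1 - 30 == 0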
--- example/neon.rs | 21 +++++++++++++++++++++ src/intrinsics/llvm_aarch64.rs | 8 ++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/example/neon.rs b/example/neon.rs index 6ea053d0b0046..1033713732240 100644 --- a/example/neon.rs +++ b/example/neon.rs @@ -180,6 +180,24 @@ unsafe fn test_vpadd_u8() { assert_eq!(r, e); } +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqsub_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b))); + let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]); + assert_eq!(r, e); +} + #[cfg(target_arch = "aarch64")] fn main() { unsafe { @@ -204,6 +222,9 @@ fn main() { test_vpadd_u16(); test_vpadd_u32(); test_vpadd_u8(); + + test_vqsub_u8(); + test_vqadd_u8(); } } diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index fdad5a474d6d1..ee098be1fce6b 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { From f824da66c60a5b252bab6bcbabcc30a489b29019 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 4 Nov 2023 19:41:08 +0000 Subject: [PATCH 16/24] Make neon example build in all arches --- example/neon.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/neon.rs b/example/neon.rs index 1033713732240..41b5189056006 100644 --- a/example/neon.rs +++ b/example/neon.rs @@ -228,5 +228,5 @@ fn main() { } } -#[cfg(target_arch = "x86_64")] +#[cfg(not(target_arch = "aarch64"))] fn main() {} From 209476e33acbeb14213c2edbb6e877dd251d4943 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 4 Nov 2023 19:47:56 +0000 Subject: [PATCH 17/24] Only import aarch64 intrinsics on aarch64 --- example/neon.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/example/neon.rs b/example/neon.rs index 41b5189056006..bad26947967dc 100644 --- a/example/neon.rs +++ b/example/neon.rs @@ -1,6 +1,8 @@ // Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs #![feature(portable_simd)] + +#[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; use std::mem::transmute; use std::simd::*; From 6a53acefd8520d2d3a92dc56db78e4c5e5c26cf9 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sun, 5 Nov 2023 18:24:39 +0000 Subject: [PATCH 18/24] Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics 
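
For reference (a summary of the Intel intrinsics guide, not text from the
diff): each 128-bit half of the result is selected independently by one
nibble of imm8. Bits 1:0 of a nibble pick 0 -> a[127:0], 1 -> a[255:128],
2 -> b[127:0], 3 -> b[255:128], and bit 3 zeroes that half. The low nibble
drives result[127:0] and the high nibble result[255:128], so e.g.
imm8 = 0x21 gives result[127:0] = a[255:128] and result[255:128] = b[127:0].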
--- src/intrinsics/llvm_x86.rs | 38 +++++++++++++------------- src/value_and_place.rs | 56 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 19 deletions(-) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 6fd85ab8bf1d5..97759b59ebe62 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } } - "llvm.x86.avx2.vperm2i128" => { + "llvm.x86.avx2.vperm2i128" + | "llvm.x86.avx.vperm2f128.ps.256" + | "llvm.x86.avx.vperm2f128.pd.256" => { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd let (a, b, imm8) = match args { [a, b, imm8] => (a, b, imm8), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let b = codegen_operand(fx, b); let imm8 = codegen_operand(fx, imm8).load_scalar(fx); - let a_0 = a.value_lane(fx, 0).load_scalar(fx); - let a_1 = a.value_lane(fx, 1).load_scalar(fx); - let a_low = fx.bcx.ins().iconcat(a_0, a_1); - let a_2 = a.value_lane(fx, 2).load_scalar(fx); - let a_3 = a.value_lane(fx, 3).load_scalar(fx); - let a_high = fx.bcx.ins().iconcat(a_2, a_3); + let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); - let b_0 = b.value_lane(fx, 0).load_scalar(fx); - let b_1 = b.value_lane(fx, 1).load_scalar(fx); - let b_low = fx.bcx.ins().iconcat(b_0, b_1); - let b_2 = b.value_lane(fx, 2).load_scalar(fx); - let b_3 = b.value_lane(fx, 3).load_scalar(fx); - let b_high = fx.bcx.ins().iconcat(b_2, b_3); + let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); fn select4( fx: &mut FunctionCx<'_, '_, '_>, @@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let control0 = imm8; let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); - let (res_0, res_1) = fx.bcx.ins().isplit(res_low); let control1 = fx.bcx.ins().ushr_imm(imm8, 4); let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); - let (res_2, res_3) = fx.bcx.ins().isplit(res_high); - ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); - ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); - ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); - ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store( + fx, + res_low, + MemFlags::trusted(), + ); + ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store( + fx, + res_high, + MemFlags::trusted(), + ); } "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => { let a = match args { diff --git a/src/value_and_place.rs b/src/value_and_place.rs index 5f0aa6c5581dd..21ad2a835fc96 100644 --- a/src/value_and_place.rs +++ b/src/value_and_place.rs @@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> { let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); let lane_layout = fx.layout_of(lane_ty); assert!(lane_idx < lane_count); + + match self.0 { + CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => 
unreachable!(), + CValueInner::ByRef(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CValue::by_ref(field_ptr, lane_layout) + } + CValueInner::ByRef(_, Some(_)) => unreachable!(), + } + } + + /// Like [`CValue::value_field`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn value_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CValue<'tcx> { + let layout = self.1; + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + match self.0 { CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), CValueInner::ByRef(ptr, None) => { @@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> { } } + /// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn place_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CPlace<'tcx> { + let layout = self.layout(); + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + + match self.inner { + CPlaceInner::Var(_, _) => unreachable!(), + CPlaceInner::VarPair(_, _, _) => unreachable!(), + CPlaceInner::Addr(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CPlace::for_ptr(field_ptr, lane_layout) + } + CPlaceInner::Addr(_, Some(_)) => unreachable!(), + } + } + pub(crate) fn place_index( self, fx: &mut FunctionCx<'_, '_, 'tcx>, From 438194980b9184fc13dab03444e604db9f5a8a07 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Sun, 5 Nov 2023 19:49:30 +0000 Subject: [PATCH 19/24] Implement all avx2 intrinsics used by the image crate --- src/intrinsics/llvm_x86.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 97759b59ebe62..9c415d468d871 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -304,7 +304,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( fx.bcx.ins().sshr(a_lane, saturated_count) }); } - "llvm.x86.sse2.psad.bw" => { + "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); @@ -335,7 +337,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.ssse3.pmadd.ub.sw.128" => { + "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267 + // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270 intrinsic_args!(fx, args => (a, b); intrinsic); let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx); @@ -374,7 +378,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.sse2.pmadd.wd" => { + "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); From 61e38ceea7fb8dad16a0e1c42b70417b057e22a7 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Tue, 7 Nov 2023 08:59:16 +0000 Subject: [PATCH 20/24] Implement all SSE intrinsics used by the jpeg-decoder crate --- src/intrinsics/llvm_x86.rs | 71 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 9c415d468d871..622688fb82c16 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -413,6 +413,77 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } + + "llvm.x86.ssse3.pmul.hr.sw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count, ret_lane_count); + + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + for out_lane_idx in 0..lane_count { + let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx); + let a_lane = fx.bcx.ins().sextend(types::I32, a_lane); + let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx); + let b_lane = fx.bcx.ins().sextend(types::I32, b_lane); + + let mul: Value = fx.bcx.ins().imul(a_lane, b_lane); + let shifted = fx.bcx.ins().ushr_imm(mul, 14); + let incremented = fx.bcx.ins().iadd_imm(shifted, 1); + let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1); + + let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packuswb.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + 
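// NOTE (editor's annotation, not part of the patch): the smax/umin pair below
// implements unsigned saturation of a signed i16 lane to u8. The signed max
// against zero maps negative lanes to 0, after which the unsigned min against
// 255 is sound. Scalar model: `lane.clamp(0, 255) as u8`.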
let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + _ => { fx.tcx .sess From 0a35232c8523968444bc8f86ed630d886cfe6525 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Tue, 7 Nov 2023 09:42:01 +0000 Subject: [PATCH 21/24] Implement all vendor intrinsics used by the httparse crate --- src/intrinsics/llvm_x86.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 622688fb82c16..6a91ce42fdf6b 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -27,6 +27,16 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); } + "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010 + intrinsic_args!(fx, args => (ptr); intrinsic); + + // FIXME correctly handle unalignedness + let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout()); + ret.write_cvalue(fx, val); + } + "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { let (x, y, kind) = match args { [x, y, kind] => (x, y, kind), From ecf79a304a0676d2ca2d071127494e83e4aacb90 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:32:11 +0000 Subject: [PATCH 22/24] Implement all vendor intrinsics used by the fimg crate --- src/intrinsics/llvm_x86.rs | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 6a91ce42fdf6b..0d557469c7782 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -494,6 +494,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } + "llvm.x86.avx2.packuswb" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + 
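// NOTE (editor's annotation, not part of the patch): unlike the 128-bit form
// above, the 256-bit pack interleaves per 128-bit lane. The 32 result bytes
// are sat(a[0..8]), sat(b[0..8]), sat(a[8..16]), sat(b[8..16]), which is why
// there are four lane_count / 2 loops, the last two reading the high halves.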
let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packssdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
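// NOTE (editor's annotation, not part of the patch): here the clamp is a true
// signed saturation of an i32 lane into the i16 range. Scalar model:
// `lane.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16`.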
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         _ => {
             fx.tcx
                 .sess

From 864973135a3449e2ce3be91d894509dfad2d94a6 Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Tue, 7 Nov 2023 18:08:30 +0000
Subject: [PATCH 23/24] Implement all vendor intrinsics used by the simd-json
 crate

---
 src/intrinsics/llvm_x86.rs | 138 +++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
index 0d557469c7782..4c536048626ee 100644
--- a/src/intrinsics/llvm_x86.rs
+++ b/src/intrinsics/llvm_x86.rs
@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
         }
     }

+        "llvm.x86.sse41.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
+            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         "llvm.x86.avx2.packssdw" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
             intrinsic_args!(fx, args => (a, b); intrinsic);
@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
         }
     }

+        "llvm.x86.pclmulqdq" => {
+            //
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 + intrinsic_args!(fx, args => (a, b, imm8); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret_lane_ty, fx.tcx.types.i64); + assert_eq!(lane_count, 2); + assert_eq!(ret_lane_count, 2); + + let imm8 = imm8.load_scalar(fx); + + let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001); + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0); + + let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0); + + fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value { + let tmp = fx.bcx.ins().ushr_imm(val, bit); + fx.bcx.ins().band_imm(tmp, 1) + } + + let mut res1 = fx.bcx.ins().iconst(types::I64, 0); + for i in 0..=63 { + let x = extract_bit(fx, temp1, 0); + let y = extract_bit(fx, temp2, i); + let mut temp = fx.bcx.ins().band(x, y); + for j in 1..=i { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res1 = fx.bcx.ins().bor(res1, temp); + } + ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted()); + + let mut res2 = fx.bcx.ins().iconst(types::I64, 0); + for i in 64..=127 { + let mut temp = fx.bcx.ins().iconst(types::I64, 0); + for j in i - 63..=63 { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res2 = fx.bcx.ins().bor(res2, temp); + } + ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); + } + + "llvm.x86.avx.ptestz.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret.layout().ty, fx.tcx.types.i32); + assert_eq!(lane_count, 4); + + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let a_lane2 = a.value_lane(fx, 2).load_scalar(fx); + let a_lane3 = a.value_lane(fx, 3).load_scalar(fx); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let b_lane2 = b.value_lane(fx, 2).load_scalar(fx); + let b_lane3 = b.value_lane(fx, 3).load_scalar(fx); + + let zero0 = fx.bcx.ins().band(a_lane0, b_lane0); + let zero1 = fx.bcx.ins().band(a_lane1, b_lane1); + let zero2 = fx.bcx.ins().band(a_lane2, b_lane2); + let zero3 = fx.bcx.ins().band(a_lane3, b_lane3); + + let all_zero0 = fx.bcx.ins().bor(zero0, zero1); + let all_zero1 = fx.bcx.ins().bor(zero2, zero3); + let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1); + + let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0); + let res = CValue::by_val( + 
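// NOTE (editor's annotation, not part of the patch): `res` is the boolean
// result of the icmp above, 1 exactly when all four (a & b) lanes are zero,
// matching the _mm256_testz_si256 contract; it is widened below to the i32
// the intrinsic returns.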
fx.bcx.ins().uextend(types::I32, res), + fx.layout_of(fx.tcx.types.i32), + ); + ret.write_cvalue(fx, res); + } + _ => { fx.tcx .sess From c84d1871dc4456539b7b578830268ab3539915d0 Mon Sep 17 00:00:00 2001 From: bjorn3 <17426603+bjorn3@users.noreply.github.com> Date: Fri, 10 Nov 2023 11:18:14 +0000 Subject: [PATCH 24/24] Rustup to rustc 1.75.0-nightly (0f44eb32f 2023-11-09) --- patches/stdlib-lock.toml | 9 ++++----- rust-toolchain | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/patches/stdlib-lock.toml b/patches/stdlib-lock.toml index 9902bca8eab27..8a690bada0df5 100644 --- a/patches/stdlib-lock.toml +++ b/patches/stdlib-lock.toml @@ -58,9 +58,9 @@ dependencies = [ [[package]] name = "compiler_builtins" -version = "0.1.100" +version = "0.1.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775" +checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242" dependencies = [ "cc", "rustc-std-workspace-core", @@ -158,9 +158,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.149" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" dependencies = [ "rustc-std-workspace-core", ] @@ -415,7 +415,6 @@ dependencies = [ name = "unwind" version = "0.0.0" dependencies = [ - "cc", "cfg-if", "compiler_builtins", "core", diff --git a/rust-toolchain b/rust-toolchain index 54683e1ebb18b..b832b06e0ffba 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,3 +1,3 @@ [toolchain] -channel = "nightly-2023-11-02" +channel = "nightly-2023-11-10" components = ["rust-src", "rustc-dev", "llvm-tools"]
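As a closing cross-check for the pack lowerings implemented in this series, here is a scalar-level test in the style of the example/neon.rs tests from the earlier patches. It is an editor's sketch, not part of any patch; _mm_packs_epi32 is the stable stdarch intrinsic that lowers to llvm.x86.sse2.packssdw.128:

#[cfg(target_arch = "x86_64")]
unsafe fn test_mm_packs_epi32() {
    use std::arch::x86_64::*;
    use std::mem::transmute;
    let a = _mm_setr_epi32(-1, 40000, -40000, 32767);
    let b = _mm_setr_epi32(0, 1, -32768, 65536);
    let r: [i16; 8] = transmute(_mm_packs_epi32(a, b));
    // Lanes 0..4 come from `a`, lanes 4..8 from `b`; each i32 saturates to the
    // i16 range on the way down.
    assert_eq!(r, [-1, 32767, -32768, 32767, 0, 1, -32768, 32767]);
}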