From e7664e004a4187d9f8a0790c6ddd5fd63fd680e8 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Jul 2025 16:49:31 -0700 Subject: [PATCH 1/6] offload wrapper generation --- compiler/rustc_codegen_llvm/src/builder.rs | 1 + .../src/builder/gpu_offload.rs | 4 +- .../src/builder/gpu_wrapper.rs | 119 ++++++++++++++++++ compiler/rustc_codegen_llvm/src/context.rs | 17 +++ compiler/rustc_codegen_llvm/src/lib.rs | 15 +++ compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 9 +- 6 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 0ade9edb0d2ea..23976354cacdf 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -4,6 +4,7 @@ use std::{iter, ptr}; pub(crate) mod autodiff; pub(crate) mod gpu_offload; +pub(crate) mod gpu_wrapper; use libc::{c_char, c_uint, size_t}; use rustc_abi as abi; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 1280ab1442a09..da7bde86463bc 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -12,7 +12,7 @@ use crate::llvm::{self, Linkage, Type, Value}; use crate::{LlvmCodegenBackend, SimpleCx, attributes}; pub(crate) fn handle_gpu_code<'ll>( - _cgcx: &CodegenContext, + cgcx: &CodegenContext, cx: &'ll SimpleCx<'_>, ) { // The offload memory transfer type for each kernel @@ -26,8 +26,8 @@ pub(crate) fn handle_gpu_code<'ll>( kernels.push(kernel); } } - gen_call_handling(&cx, &kernels, &o_types); + crate::builder::gpu_wrapper::gen_image_wrapper_module(&cgcx); } // What is our @1 here? 
A magic global, used in our data_{begin/update/end}_mapper: diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs new file mode 100644 index 0000000000000..037208d656a81 --- /dev/null +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -0,0 +1,119 @@ +use std::ffi::CString; + +use llvm::Linkage::*; +use rustc_abi::Align; +use rustc_codegen_ssa::back::write::CodegenContext; +use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; + +use crate::builder::gpu_offload::*; +use crate::llvm::{self, Visibility}; +use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; + +pub(crate) fn create_struct_ty<'ll>( + cx: &'ll SimpleCx<'_>, + name: &str, + tys: &[&'ll llvm::Type], +) -> &'ll llvm::Type { + let entry_struct_name = CString::new(name).unwrap(); + unsafe { + let entry_struct = llvm::LLVMStructCreateNamed(cx.llcx, entry_struct_name.as_ptr()); + llvm::LLVMStructSetBody(entry_struct, tys.as_ptr(), tys.len() as u32, 0); + entry_struct + } +} + +// We don't copy types from other functions because we generate a new module and context. +// Bringing in types from other contexts would likely cause issues. 
+pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext) { + let dl_cstr = CString::new("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9").unwrap(); + let target_cstr = CString::new("amdgcn-amd-amdhsa").unwrap(); + let name = "offload.wrapper.module"; + let m: crate::ModuleLlvm = + ModuleLlvm::new_simple(name, dl_cstr.into_raw(), target_cstr.into_raw(), &cgcx).unwrap(); + let cx = SimpleCx::new(m.llmod(), m.llcx, cgcx.pointer_size); + let tptr = cx.type_ptr(); + let ti64 = cx.type_i64(); + let ti32 = cx.type_i32(); + let ti16 = cx.type_i16(); + + let entry_fields = [ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr]; + create_struct_ty(&cx, "__tgt_offload_entry", &entry_fields); + create_struct_ty(&cx, "__tgt_device_image", &[tptr, tptr, tptr, tptr]); + create_struct_ty(&cx, "__tgt_bin_desc", &[ti32, tptr, tptr, tptr]); + + let offload_entry_ty = add_tgt_offload_entry(&cx); + let offload_entry_arr = cx.type_array(offload_entry_ty, 0); + + let c_name = CString::new("__start_omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, ExternalLinkage); + llvm::set_visibility(llglobal, Visibility::Hidden); + let c_name = CString::new("__stop_omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, ExternalLinkage); + llvm::set_visibility(llglobal, Visibility::Hidden); + + let c_name = CString::new("__dummy.omp_offloading_entries").unwrap(); + let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, InternalLinkage); + let c_section_name = 
CString::new("omp_offloading_entries").unwrap(); + llvm::set_section(llglobal, &c_section_name); + let zeroinit = cx.const_null(offload_entry_arr); + llvm::set_initializer(llglobal, zeroinit); + + CString::new("llvm.compiler.used").unwrap(); + let arr_val = cx.const_array(tptr, &[llglobal]); + let c_section_name = CString::new("llvm.metadata").unwrap(); + let llglobal = add_global(&cx, "llvm.compiler.used", arr_val, AppendingLinkage); + llvm::set_section(llglobal, &c_section_name); + llvm::set_global_constant(llglobal, false); + + //@llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.omp_offloading_entries], section "llvm.metadata" + + let mapper_fn_ty = cx.type_func(&[tptr], cx.type_void()); + crate::declare::declare_simple_fn( + &cx, + &"__tgt_unregister_lib", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + mapper_fn_ty, + ); + crate::declare::declare_simple_fn( + &cx, + &"__tgt_register_lib", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + mapper_fn_ty, + ); + crate::declare::declare_simple_fn( + &cx, + &"atexit", + llvm::CallConv::CCallConv, + llvm::UnnamedAddr::No, + llvm::Visibility::Default, + cx.type_func(&[tptr], ti32), + ); + + let unknown_txt = "11111111111111"; + let c_entry_name = CString::new(unknown_txt).unwrap(); + let c_val = c_entry_name.as_bytes_with_nul(); + let initializer = crate::common::bytes_in_context(cx.llcx, c_val); + let llglobal = + add_unnamed_global(&cx, &".omp_offloading.device_image", initializer, InternalLinkage); + let c_section_name = CString::new(".llvm.offloading").unwrap(); + llvm::set_section(llglobal, &c_section_name); + llvm::set_alignment(llglobal, Align::EIGHT); + + unsafe { + llvm::LLVMPrintModuleToFile( + cx.llmod, + CString::new("rustmagic.openmp.image.wrapper.ll").unwrap().as_ptr(), + std::ptr::null_mut(), + ); + } +} diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs index 
ee77774c68832..2563d2e18e99d 100644 --- a/compiler/rustc_codegen_llvm/src/context.rs +++ b/compiler/rustc_codegen_llvm/src/context.rs @@ -159,6 +159,23 @@ fn to_llvm_tls_model(tls_model: TlsModel) -> llvm::ThreadLocalMode { } } +// FIXME(offload): This method is not relying on a tcx. We might still want to try to share some of +// the logic with create_module, e.g. the target_data_layout handling. +pub(crate) unsafe fn create_simple_module<'ll>( + llcx: &'ll llvm::Context, + target_data_layout: *const i8, + target_triple: *const i8, + mod_name: &str, +) -> &'ll llvm::Module { + let mod_name = SmallCStr::new(mod_name); + let llmod = unsafe { llvm::LLVMModuleCreateWithNameInContext(mod_name.as_ptr(), llcx) }; + unsafe { + llvm::LLVMSetDataLayout(llmod, target_data_layout); + llvm::LLVMSetTarget(llmod, target_triple); + } + llmod +} + pub(crate) unsafe fn create_module<'ll>( tcx: TyCtxt<'_>, llcx: &'ll llvm::Context, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index aaf21f9ada9a5..a077f63749136 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -388,6 +388,21 @@ unsafe impl Send for ModuleLlvm {} unsafe impl Sync for ModuleLlvm {} impl ModuleLlvm { + fn new_simple( + name: &str, + dl_cstr: *const i8, + target_cstr: *const i8, + cgcx: &CodegenContext, + ) -> Result { + unsafe { + let llcx = llvm::LLVMRustContextCreate(false); + let llmod_raw = context::create_simple_module(llcx, dl_cstr, target_cstr, name); + let dcx = cgcx.create_dcx(); + let tm = ModuleLlvm::tm_from_cgcx(cgcx, name, dcx.handle())?; + Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) }) + } + } + fn new(tcx: TyCtxt<'_>, mod_name: &str) -> Self { unsafe { let llcx = llvm::LLVMRustContextCreate(tcx.sess.fewer_names()); diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index edfb29dd1be72..1c7532acdc02e 100644 --- 
a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1005,16 +1005,23 @@ unsafe extern "C" { ) -> MetadataKindId; // Create modules. + pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; pub(crate) fn LLVMModuleCreateWithNameInContext( ModuleID: *const c_char, C: &Context, ) -> &Module; - pub(crate) safe fn LLVMCloneModule(M: &Module) -> &Module; + pub(crate) fn LLVMPrintModuleToFile( + M: &Module, + Name: *const c_char, + Error_message: *mut c_char, + ); /// Data layout. See Module::getDataLayout. pub(crate) fn LLVMGetDataLayoutStr(M: &Module) -> *const c_char; pub(crate) fn LLVMSetDataLayout(M: &Module, Triple: *const c_char); + pub(crate) fn LLVMSetTarget(M: &Module, Name: *const c_char); + /// Append inline assembly to a module. See `Module::appendModuleInlineAsm`. pub(crate) fn LLVMAppendModuleInlineAsm( M: &Module, From b3e38cdadf526082d8cf15068d5c2381c17cd1d8 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Jul 2025 16:53:37 -0700 Subject: [PATCH 2/6] postpone device side generation --- compiler/rustc_codegen_llvm/src/builder.rs | 1 + .../src/builder/gpu_device.rs | 113 ++++++++++++++++++ compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 8 ++ compiler/rustc_codegen_llvm/src/llvm/mod.rs | 9 ++ 4 files changed, 131 insertions(+) create mode 100644 compiler/rustc_codegen_llvm/src/builder/gpu_device.rs diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 23976354cacdf..67e7951124879 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -3,6 +3,7 @@ use std::ops::Deref; use std::{iter, ptr}; pub(crate) mod autodiff; +pub(crate) mod gpu_device; pub(crate) mod gpu_offload; pub(crate) mod gpu_wrapper; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs new file mode 100644 index 0000000000000..63416743ca322 --- /dev/null +++ 
b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs @@ -0,0 +1,113 @@ +use std::ffi::{CString, c_uint}; + +use llvm::Linkage::*; +use rustc_codegen_ssa::back::write::CodegenContext; + +use crate::llvm::{self, Linkage}; +use crate::{LlvmCodegenBackend, SimpleCx}; + +fn add_unnamed_global_in_addrspace<'ll>( + cx: &SimpleCx<'ll>, + name: &str, + initializer: &'ll llvm::Value, + l: Linkage, + addrspace: u32, +) -> &'ll llvm::Value { + let llglobal = add_global_in_addrspace(cx, name, initializer, l, addrspace); + unsafe { llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global) }; + llglobal +} + +pub(crate) fn add_global_in_addrspace<'ll>( + cx: &SimpleCx<'ll>, + name: &str, + initializer: &'ll llvm::Value, + l: Linkage, + addrspace: u32, +) -> &'ll llvm::Value { + let c_name = CString::new(name).unwrap(); + let llglobal: &'ll llvm::Value = llvm::add_global_in_addrspace( + cx.llmod, + cx.val_ty(initializer), + &c_name, + addrspace as c_uint, + ); + llvm::set_global_constant(llglobal, true); + llvm::set_linkage(llglobal, l); + llvm::set_initializer(llglobal, initializer); + llglobal +} + +#[allow(unused)] +pub(crate) fn gen_asdf<'ll>(cgcx: &CodegenContext, _old_cx: &SimpleCx<'ll>) { + let llcx = unsafe { llvm::LLVMRustContextCreate(false) }; + let module_name = CString::new("offload.wrapper.module").unwrap(); + let llmod = unsafe { llvm::LLVMModuleCreateWithNameInContext(module_name.as_ptr(), llcx) }; + let cx = SimpleCx::new(llmod, llcx, cgcx.pointer_size); + let initializer = cx.get_const_i32(0); + add_unnamed_global_in_addrspace(&cx, "__omp_rtl_debug_kind", initializer, WeakODRLinkage, 1); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_teams_oversubscription", + initializer, + WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_threads_oversubscription", + initializer, + WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__omp_rtl_assume_no_thread_state", + initializer, + 
WeakODRLinkage, + 1, + ); + add_unnamed_global_in_addrspace( + &cx, + "__oclc_ABI_version", + cx.get_const_i32(500), + WeakODRLinkage, + 4, + ); + unsafe { + llvm::LLVMPrintModuleToFile( + llmod, + CString::new("rustmagic-openmp-amdgcn-amd-amdhsa-gfx90a.ll").unwrap().as_ptr(), + std::ptr::null_mut(), + ); + + // Clean up + llvm::LLVMDisposeModule(llmod); + llvm::LLVMContextDispose(llcx); + } + // TODO: addressspace 1 or 4 +} +// source_filename = "mem.cpp" +// GPU: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// CPU: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +// target triple = "amdgcn-amd-amdhsa" +// +// @__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr addrspace(1) constant i32 0 +// @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +// +// !llvm.module.flags = !{!0, !1, !2, !3, !4} +// !opencl.ocl.version = !{!5} +// !llvm.ident = !{!6, !7} +// +// !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// !1 = !{i32 1, !"wchar_size", i32 4} +// !2 = !{i32 7, !"openmp", i32 51} +// !3 = !{i32 7, !"openmp-device", i32 51} +// !4 = !{i32 8, !"PIC Level", i32 2} +// !5 = !{i32 2, i32 0} +// !6 = !{!"clang version 20.1.5-rust-1.89.0-nightly (https://github.com/rust-lang/llvm-project.git c1118fdbb3024157df7f4cfe765f2b0b4339e8a2)"} +// 
!7 = !{!"AMD clang version 19.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.4.0 25133 c7fe45cf4b819c5991fe208aaa96edf142730f1d)"} diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 1c7532acdc02e..8f16927ea1385 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1015,6 +1015,8 @@ unsafe extern "C" { Name: *const c_char, Error_message: *mut c_char, ); + pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; + pub(crate) fn LLVMDisposeModule(M: &Module); /// Data layout. See Module::getDataLayout. pub(crate) fn LLVMGetDataLayoutStr(M: &Module) -> *const c_char; @@ -1182,6 +1184,12 @@ unsafe extern "C" { // Operations on global variables pub(crate) safe fn LLVMIsAGlobalVariable(GlobalVar: &Value) -> Option<&Value>; pub(crate) fn LLVMAddGlobal<'a>(M: &'a Module, Ty: &'a Type, Name: *const c_char) -> &'a Value; + pub(crate) fn LLVMAddGlobalInAddressSpace<'a>( + M: &'a Module, + Ty: &'a Type, + Name: *const c_char, + addrspace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMGetNamedGlobal(M: &Module, Name: *const c_char) -> Option<&Value>; pub(crate) fn LLVMGetFirstGlobal(M: &Module) -> Option<&Value>; pub(crate) fn LLVMGetNextGlobal(GlobalVar: &Value) -> Option<&Value>; diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs index 154ba4fd69018..a9035ef231f2d 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs @@ -251,6 +251,15 @@ pub(crate) fn add_global<'a>(llmod: &'a Module, ty: &'a Type, name_cstr: &CStr) unsafe { LLVMAddGlobal(llmod, ty, name_cstr.as_ptr()) } } +pub(crate) fn add_global_in_addrspace<'a>( + llmod: &'a Module, + ty: &'a Type, + name_cstr: &CStr, + addrspace: c_uint, +) -> &'a Value { + unsafe { LLVMAddGlobalInAddressSpace(llmod, ty, name_cstr.as_ptr(), addrspace) } +} + pub(crate) fn set_initializer(llglobal: &Value, 
constant_val: &Value) { unsafe { LLVMSetInitializer(llglobal, constant_val); From 5003a07f4493a4a3d2d5f90ec7bb3473cb1b4b6a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 10 Jul 2025 15:09:34 -0700 Subject: [PATCH 3/6] fix device code --- .../src/builder/gpu_offload.rs | 2 +- .../src/builder/gpu_wrapper.rs | 73 ++++++++----------- 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index da7bde86463bc..b87806b8807b3 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -248,7 +248,7 @@ fn gen_define_handling<'ll>( o_types } -fn declare_offload_fn<'ll>( +pub(crate) fn declare_offload_fn<'ll>( cx: &'ll SimpleCx<'_>, name: &str, ty: &'ll llvm::Type, diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs index 037208d656a81..22aed90f8c188 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -1,12 +1,12 @@ use std::ffi::CString; use llvm::Linkage::*; -use rustc_abi::Align; +use rustc_abi::{AddressSpace, Align}; use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; use crate::builder::gpu_offload::*; -use crate::llvm::{self, Visibility}; +use crate::llvm::{self, Linkage, Type, Value, Visibility}; use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; pub(crate) fn create_struct_ty<'ll>( @@ -22,6 +22,23 @@ pub(crate) fn create_struct_ty<'ll>( } } +pub(crate) fn add_global_decl<'ll>( + cx: &SimpleCx<'ll>, + ty: &'ll Type, + name: &str, + l: Linkage, + hidden: bool, +) -> &'ll llvm::Value { + let c_name = CString::new(name).unwrap(); + let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, ty, &c_name); + llvm::set_global_constant(llglobal, true); + 
llvm::set_linkage(llglobal, l); + if hidden { + llvm::set_visibility(llglobal, Visibility::Hidden); + } + llglobal +} + // We don't copy types from other functions because we generate a new module and context. // Bringing in types from other contexts would likely cause issues. pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext) { @@ -32,6 +49,7 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext ModuleLlvm::new_simple(name, dl_cstr.into_raw(), target_cstr.into_raw(), &cgcx).unwrap(); let cx = SimpleCx::new(m.llmod(), m.llcx, cgcx.pointer_size); let tptr = cx.type_ptr(); + let tptr1 = cx.type_ptr_ext(AddressSpace(1)); let ti64 = cx.type_i64(); let ti32 = cx.type_i32(); let ti16 = cx.type_i16(); @@ -44,28 +62,22 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext let offload_entry_ty = add_tgt_offload_entry(&cx); let offload_entry_arr = cx.type_array(offload_entry_ty, 0); - let c_name = CString::new("__start_omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, ExternalLinkage); - llvm::set_visibility(llglobal, Visibility::Hidden); - let c_name = CString::new("__stop_omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, ExternalLinkage); - llvm::set_visibility(llglobal, Visibility::Hidden); + let name = "__start_omp_offloading_entries"; + add_global_decl(&cx, offload_entry_arr, name, ExternalLinkage, true); + + let name = "__stop_omp_offloading_entries"; + add_global_decl(&cx, offload_entry_arr, name, ExternalLinkage, true); + + let name = "__dummy.omp_offloading_entries"; + let llglobal = add_global_decl(&cx, offload_entry_arr, name, InternalLinkage, false); - let c_name = CString::new("__dummy.omp_offloading_entries").unwrap(); - let llglobal = llvm::add_global(cx.llmod, 
offload_entry_arr, &c_name); - llvm::set_global_constant(llglobal, true); - llvm::set_linkage(llglobal, InternalLinkage); let c_section_name = CString::new("omp_offloading_entries").unwrap(); llvm::set_section(llglobal, &c_section_name); let zeroinit = cx.const_null(offload_entry_arr); llvm::set_initializer(llglobal, zeroinit); CString::new("llvm.compiler.used").unwrap(); - let arr_val = cx.const_array(tptr, &[llglobal]); + let arr_val = cx.const_array(tptr1, &[llglobal]); let c_section_name = CString::new("llvm.metadata").unwrap(); let llglobal = add_global(&cx, "llvm.compiler.used", arr_val, AppendingLinkage); llvm::set_section(llglobal, &c_section_name); @@ -74,30 +86,9 @@ pub(crate) fn gen_image_wrapper_module(cgcx: &CodegenContext //@llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.omp_offloading_entries], section "llvm.metadata" let mapper_fn_ty = cx.type_func(&[tptr], cx.type_void()); - crate::declare::declare_simple_fn( - &cx, - &"__tgt_unregister_lib", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - mapper_fn_ty, - ); - crate::declare::declare_simple_fn( - &cx, - &"__tgt_register_lib", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - mapper_fn_ty, - ); - crate::declare::declare_simple_fn( - &cx, - &"atexit", - llvm::CallConv::CCallConv, - llvm::UnnamedAddr::No, - llvm::Visibility::Default, - cx.type_func(&[tptr], ti32), - ); + declare_offload_fn(&cx, &"__tgt_register_lib", mapper_fn_ty); + declare_offload_fn(&cx, &"__tgt_unregister_lib", mapper_fn_ty); + declare_offload_fn(&cx, &"atexit", cx.type_func(&[tptr], ti32)); let unknown_txt = "11111111111111"; let c_entry_name = CString::new(unknown_txt).unwrap(); From 4273fb1773a9d81b92dcc93d3ed076b5a059ecde Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 22 Jul 2025 12:46:35 -0700 Subject: [PATCH 4/6] fixup --- compiler/rustc_codegen_llvm/src/back/write.rs | 2 +- compiler/rustc_codegen_llvm/src/builder/gpu_device.rs 
| 2 +- compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs | 2 +- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index 6f8fba2a30dc3..4742e3f90bf3a 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -949,7 +949,7 @@ pub(crate) fn codegen( // binaries. So we must clone the module to produce the asm output // if we are also producing object code. let llmod = if let EmitObj::ObjectCode(_) = config.emit_obj { - llvm::LLVMCloneModule(llmod) + unsafe { llvm::LLVMCloneModule(llmod) } } else { llmod }; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs index 63416743ca322..cb957872fec05 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_device.rs @@ -14,7 +14,7 @@ fn add_unnamed_global_in_addrspace<'ll>( addrspace: u32, ) -> &'ll llvm::Value { let llglobal = add_global_in_addrspace(cx, name, initializer, l, addrspace); - unsafe { llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global) }; + llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global); llglobal } diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs index 22aed90f8c188..f7cf6c906392f 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_wrapper.rs @@ -6,7 +6,7 @@ use rustc_codegen_ssa::back::write::CodegenContext; use rustc_codegen_ssa::traits::BaseTypeCodegenMethods; use crate::builder::gpu_offload::*; -use crate::llvm::{self, Linkage, Type, Value, Visibility}; +use crate::llvm::{self, Linkage, Type, Visibility}; use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx}; pub(crate) fn create_struct_ty<'ll>( 
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 8f16927ea1385..755cb0fe5501f 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1005,7 +1005,6 @@ unsafe extern "C" { ) -> MetadataKindId; // Create modules. - pub(crate) fn LLVMCloneModule(M: &Module) -> &Module; pub(crate) fn LLVMModuleCreateWithNameInContext( ModuleID: *const c_char, C: &Context, From 692facfd87c0d6972bd91c89bba8ab606ee5578a Mon Sep 17 00:00:00 2001 From: Flakebi Date: Fri, 28 Mar 2025 10:15:56 +0100 Subject: [PATCH 5/6] Fix linker-plugin-lto only doing thin lto When rust provides LLVM bitcode files to lld and the bitcode contains function summaries as used for thin lto, lld defaults to using thin lto. This prevents some optimizations that are only applied for fat lto. We solve this by not creating function summaries when fat lto is enabled. The bitcode for the module is just directly written out. An alternative solution would be to set the `ThinLTO=0` module flag to signal lld to do fat lto. 
The code in clang that sets this flag is here: https://github.com/llvm/llvm-project/blob/560149b5e3c891c64899e9912e29467a69dc3a4c/clang/lib/CodeGen/BackendUtil.cpp#L1150 The code in LLVM that queries the flag and defaults to thin lto if not set is here: https://github.com/llvm/llvm-project/blob/e258bca9505f35e0a22cb213a305eea9b76d11ea/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp#L4441-L4446 --- compiler/rustc_codegen_llvm/src/back/write.rs | 2 +- .../src/external_deps/llvm.rs | 30 +++++++++++++++++ .../src/external_deps/rustc.rs | 6 ++++ src/tools/run-make-support/src/lib.rs | 6 ++++ tests/run-make/cross-lang-lto-clang/rmake.rs | 27 ++++++++++++---- tests/run-make/fat-then-thin-lto/lib.rs | 8 +++++ tests/run-make/fat-then-thin-lto/main.rs | 11 +++++++ tests/run-make/fat-then-thin-lto/rmake.rs | 25 +++++++++++++++ tests/run-make/linker-plugin-lto-fat/ir.ll | 6 ++++ tests/run-make/linker-plugin-lto-fat/main.rs | 17 ++++++++++ tests/run-make/linker-plugin-lto-fat/rmake.rs | 32 +++++++++++++++++++ 11 files changed, 162 insertions(+), 8 deletions(-) create mode 100644 tests/run-make/fat-then-thin-lto/lib.rs create mode 100644 tests/run-make/fat-then-thin-lto/main.rs create mode 100644 tests/run-make/fat-then-thin-lto/rmake.rs create mode 100644 tests/run-make/linker-plugin-lto-fat/ir.ll create mode 100644 tests/run-make/linker-plugin-lto-fat/main.rs create mode 100644 tests/run-make/linker-plugin-lto-fat/rmake.rs diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index 4742e3f90bf3a..0279cad574129 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -861,7 +861,7 @@ pub(crate) fn codegen( "LLVM_module_codegen_make_bitcode", &*module.name, ); - ThinBuffer::new(llmod, config.emit_thin_lto, false) + ThinBuffer::new(llmod, cgcx.lto != Lto::Fat && config.emit_thin_lto, false) }; let data = thin.data(); let _timer = cgcx diff --git 
a/src/tools/run-make-support/src/external_deps/llvm.rs b/src/tools/run-make-support/src/external_deps/llvm.rs index 9a6e35da3fe20..939160d9f41d8 100644 --- a/src/tools/run-make-support/src/external_deps/llvm.rs +++ b/src/tools/run-make-support/src/external_deps/llvm.rs @@ -60,6 +60,12 @@ pub fn llvm_pdbutil() -> LlvmPdbutil { LlvmPdbutil::new() } +/// Construct a new `llvm-as` invocation. This assumes that `llvm-as` is available +/// at `$LLVM_BIN_DIR/llvm-as`. +pub fn llvm_as() -> LlvmAs { + LlvmAs::new() +} + /// Construct a new `llvm-dis` invocation. This assumes that `llvm-dis` is available /// at `$LLVM_BIN_DIR/llvm-dis`. pub fn llvm_dis() -> LlvmDis { @@ -135,6 +141,13 @@ pub struct LlvmPdbutil { cmd: Command, } +/// A `llvm-as` invocation builder. +#[derive(Debug)] +#[must_use] +pub struct LlvmAs { + cmd: Command, +} + /// A `llvm-dis` invocation builder. #[derive(Debug)] #[must_use] @@ -158,6 +171,7 @@ crate::macros::impl_common_helpers!(LlvmNm); crate::macros::impl_common_helpers!(LlvmBcanalyzer); crate::macros::impl_common_helpers!(LlvmDwarfdump); crate::macros::impl_common_helpers!(LlvmPdbutil); +crate::macros::impl_common_helpers!(LlvmAs); crate::macros::impl_common_helpers!(LlvmDis); crate::macros::impl_common_helpers!(LlvmObjcopy); @@ -441,6 +455,22 @@ impl LlvmObjcopy { } } +impl LlvmAs { + /// Construct a new `llvm-as` invocation. This assumes that `llvm-as` is available + /// at `$LLVM_BIN_DIR/llvm-as`. + pub fn new() -> Self { + let llvm_as = llvm_bin_dir().join("llvm-as"); + let cmd = Command::new(llvm_as); + Self { cmd } + } + + /// Provide an input file. + pub fn input>(&mut self, path: P) -> &mut Self { + self.cmd.arg(path.as_ref()); + self + } +} + impl LlvmDis { /// Construct a new `llvm-dis` invocation. This assumes that `llvm-dis` is available /// at `$LLVM_BIN_DIR/llvm-dis`. 
diff --git a/src/tools/run-make-support/src/external_deps/rustc.rs b/src/tools/run-make-support/src/external_deps/rustc.rs index 08ba1388dc148..60d3366ee98c8 100644 --- a/src/tools/run-make-support/src/external_deps/rustc.rs +++ b/src/tools/run-make-support/src/external_deps/rustc.rs @@ -173,6 +173,12 @@ impl Rustc { self } + /// This flag enables LTO in the specified form. + pub fn lto(&mut self, option: &str) -> &mut Self { + self.cmd.arg(format!("-Clto={option}")); + self + } + /// This flag defers LTO optimizations to the linker. pub fn linker_plugin_lto(&mut self, option: &str) -> &mut Self { self.cmd.arg(format!("-Clinker-plugin-lto={option}")); diff --git a/src/tools/run-make-support/src/lib.rs b/src/tools/run-make-support/src/lib.rs index 29cd6c4ad1591..d6d2551ac59eb 100644 --- a/src/tools/run-make-support/src/lib.rs +++ b/src/tools/run-make-support/src/lib.rs @@ -61,6 +61,12 @@ pub use crate::external_deps::c_cxx_compiler::{ pub use crate::external_deps::cargo::cargo; pub use crate::external_deps::clang::{Clang, clang}; pub use crate::external_deps::htmldocck::htmldocck; +//pub use llvm::{ +// LlvmAr, LlvmBcanalyzer, LlvmDis, LlvmDwarfdump, LlvmFilecheck, LlvmNm, LlvmObjcopy, +// LlvmObjdump, LlvmProfdata, LlvmReadobj, llvm_ar, llvm_as, llvm_bcanalyzer, llvm_dis, +// llvm_dwarfdump, llvm_filecheck, llvm_nm, llvm_objcopy, llvm_objdump, llvm_profdata, +// llvm_readobj, +//}; pub use crate::external_deps::llvm::{ self, LlvmAr, LlvmBcanalyzer, LlvmDis, LlvmDwarfdump, LlvmFilecheck, LlvmNm, LlvmObjcopy, LlvmObjdump, LlvmProfdata, LlvmReadobj, llvm_ar, llvm_bcanalyzer, llvm_dis, llvm_dwarfdump, diff --git a/tests/run-make/cross-lang-lto-clang/rmake.rs b/tests/run-make/cross-lang-lto-clang/rmake.rs index 3fed6ea20667a..0c4383e2cd815 100644 --- a/tests/run-make/cross-lang-lto-clang/rmake.rs +++ b/tests/run-make/cross-lang-lto-clang/rmake.rs @@ -28,7 +28,16 @@ static C_NEVER_INLINED_PATTERN: &'static str = "bl.*"; static C_NEVER_INLINED_PATTERN: &'static str = 
"call.*c_never_inlined"; fn main() { + test_lto(false); + test_lto(true); +} + +fn test_lto(fat_lto: bool) { + let lto = if fat_lto { "fat" } else { "thin" }; + let clang_lto = if fat_lto { "full" } else { "thin" }; + rustc() + .lto(lto) .linker_plugin_lto("on") .output(static_lib_name("rustlib-xlto")) .opt_level("2") @@ -36,7 +45,7 @@ fn main() { .input("rustlib.rs") .run(); clang() - .lto("thin") + .lto(clang_lto) .use_ld("lld") .arg("-lrustlib-xlto") .out_exe("cmain") @@ -57,9 +66,10 @@ fn main() { .input("cmain") .run() .assert_stdout_contains_regex(RUST_NEVER_INLINED_PATTERN); - clang().input("clib.c").lto("thin").arg("-c").out_exe("clib.o").arg("-O2").run(); + clang().input("clib.c").lto(clang_lto).arg("-c").out_exe("clib.o").arg("-O2").run(); llvm_ar().obj_to_ar().output_input(static_lib_name("xyz"), "clib.o").run(); rustc() + .lto(lto) .linker_plugin_lto("on") .opt_level("2") .linker(&env_var("CLANG")) @@ -72,9 +82,12 @@ fn main() { .input("rsmain") .run() .assert_stdout_not_contains_regex(C_ALWAYS_INLINED_PATTERN); - llvm_objdump() - .disassemble() - .input("rsmain") - .run() - .assert_stdout_contains_regex(C_NEVER_INLINED_PATTERN); + + let dump = llvm_objdump().disassemble().input("rsmain").run(); + if !fat_lto { + dump.assert_stdout_contains_regex(C_NEVER_INLINED_PATTERN); + } else { + // fat lto inlines this anyway + dump.assert_stdout_not_contains_regex(C_NEVER_INLINED_PATTERN); + } } diff --git a/tests/run-make/fat-then-thin-lto/lib.rs b/tests/run-make/fat-then-thin-lto/lib.rs new file mode 100644 index 0000000000000..3091988368628 --- /dev/null +++ b/tests/run-make/fat-then-thin-lto/lib.rs @@ -0,0 +1,8 @@ +#![feature(no_core, lang_items)] +#![no_core] +#![crate_type = "rlib"] + +#[lang = "sized"] +trait Sized {} + +pub fn foo() {} diff --git a/tests/run-make/fat-then-thin-lto/main.rs b/tests/run-make/fat-then-thin-lto/main.rs new file mode 100644 index 0000000000000..a3f2e18158bc0 --- /dev/null +++ b/tests/run-make/fat-then-thin-lto/main.rs @@ -0,0 
+1,11 @@ +#![allow(internal_features)] +#![feature(no_core, lang_items)] +#![no_core] +#![crate_type = "cdylib"] + +extern crate lib; + +#[unsafe(no_mangle)] +pub fn bar() { + lib::foo(); +} diff --git a/tests/run-make/fat-then-thin-lto/rmake.rs b/tests/run-make/fat-then-thin-lto/rmake.rs new file mode 100644 index 0000000000000..ef4f26689d4e8 --- /dev/null +++ b/tests/run-make/fat-then-thin-lto/rmake.rs @@ -0,0 +1,25 @@ +// Compile a library with lto=fat, then compile a binary with lto=thin +// and check that lto is applied with the library. +// The goal is to mimic the standard library being built with lto=fat +// and allowing users to build with lto=thin. + +//@ only-x86_64-unknown-linux-gnu + +use run_make_support::{dynamic_lib_name, llvm_objdump, rustc}; + +fn main() { + rustc().input("lib.rs").opt_level("3").lto("fat").run(); + rustc().input("main.rs").panic("abort").opt_level("3").lto("thin").run(); + + llvm_objdump() + .input(dynamic_lib_name("main")) + .arg("--disassemble-symbols=bar") + .run() + // The called function should be inlined. + // Check that we have a ret (to detect tail + // calls with a jmp) and no call.
+ .assert_stdout_contains("bar") + .assert_stdout_contains("ret") + .assert_stdout_not_contains("foo") + .assert_stdout_not_contains("call"); +} diff --git a/tests/run-make/linker-plugin-lto-fat/ir.ll b/tests/run-make/linker-plugin-lto-fat/ir.ll new file mode 100644 index 0000000000000..fa3dbdd4e088d --- /dev/null +++ b/tests/run-make/linker-plugin-lto-fat/ir.ll @@ -0,0 +1,6 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @ir_callee() { + ret void +} diff --git a/tests/run-make/linker-plugin-lto-fat/main.rs b/tests/run-make/linker-plugin-lto-fat/main.rs new file mode 100644 index 0000000000000..3a7b02f4b92e3 --- /dev/null +++ b/tests/run-make/linker-plugin-lto-fat/main.rs @@ -0,0 +1,17 @@ +#![feature(no_core, lang_items)] +#![no_core] +#![crate_type = "cdylib"] + +#[lang = "sized"] +trait Sized {} + +extern "C" { + fn ir_callee(); +} + +#[no_mangle] +extern "C" fn rs_foo() { + unsafe { + ir_callee(); + } +} diff --git a/tests/run-make/linker-plugin-lto-fat/rmake.rs b/tests/run-make/linker-plugin-lto-fat/rmake.rs new file mode 100644 index 0000000000000..0cfc799d2aaa1 --- /dev/null +++ b/tests/run-make/linker-plugin-lto-fat/rmake.rs @@ -0,0 +1,32 @@ +// Check that -C lto=fat with -C linker-plugin-lto actually works and can inline functions. +// A library is created from LLVM IR, defining a single function. Then a dylib is compiled, +// linking to the library and calling the function from the library. +// The function from the library should end up inlined and disappear from the output. 
+ +//@ only-x86_64-unknown-linux-gnu +//@ needs-rust-lld + +use run_make_support::{dynamic_lib_name, llvm_as, llvm_objdump, rustc}; + +fn main() { + llvm_as().input("ir.ll").run(); + rustc() + .input("main.rs") + .opt_level("3") + .lto("fat") + .linker_plugin_lto("on") + .link_arg("ir.bc") + .arg("-Zlinker-features=+lld") + .run(); + + llvm_objdump() + .input(dynamic_lib_name("main")) + .arg("--disassemble-symbols=rs_foo") + .run() + // The called function should be inlined. + // Check that we have a ret (to detect tail + // calls with a jmp) and no call. + .assert_stdout_contains("foo") + .assert_stdout_contains("ret") + .assert_stdout_not_contains("call"); +} From 28b9090146dd6136a613f6447d2e15fdb460dfea Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 23 Jul 2025 16:49:39 -0700 Subject: [PATCH 6/6] disable cfg.has_reliable_f128 on amdgcn --- compiler/rustc_codegen_llvm/src/llvm_util.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/rustc_codegen_llvm/src/llvm_util.rs b/compiler/rustc_codegen_llvm/src/llvm_util.rs index 0fb987bdf82ed..3bcd417941e96 100644 --- a/compiler/rustc_codegen_llvm/src/llvm_util.rs +++ b/compiler/rustc_codegen_llvm/src/llvm_util.rs @@ -405,6 +405,8 @@ fn update_target_reliable_float_cfg(sess: &Session, cfg: &mut TargetConfig) { ("mips64" | "mips64r6", _) => false, // Selection bug ("nvptx64", _) => false, + // Unsupported https://github.com/llvm/llvm-project/issues/121122 + ("amdgpu", _) => false, // ABI bugs et al. (full // list at ) ("powerpc" | "powerpc64", _) => false,