Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Miscompilation corrupts stack-allocated vectors #63475

Closed
cbeuw opened this issue Jun 23, 2023 · 11 comments
Closed

Miscompilation corrupts stack-allocated vectors #63475

cbeuw opened this issue Jun 23, 2023 · 11 comments

Comments

@cbeuw
Copy link

cbeuw commented Jun 23, 2023

This should print 42 42 42 42 42 42 42, but prints 42 0 42 0 42 42 42 with clang or opt -O3: https://godbolt.org/z/8v3d7enK8

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline
; llvm-reduce output for repro::black_box_2: each argument is compared with
; zero, the i1 result is zero-extended to i8 and stored through %0, and each
; store is followed by an empty side-effecting inline asm with a memory clobber.
define internal fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 %val0, i64 %val1) #0 {
start:
  ; NOTE(review): zero-sized alloca that is nevertheless stored through below;
  ; the thread attributes these allocas to llvm-reduce, not to rustc.
  %0 = alloca [0 x [0 x [0 x i8]]], i32 0, align 1
  %_4 = icmp eq i128 %val0, 0
  %1 = zext i1 %_4 to i8
  store i8 %1, ptr %0, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr %0)
  %_7 = icmp eq i64 %val1, 0
  %2 = zext i1 %_7 to i8
  store i8 %2, ptr %0, align 1
  ; Unlike the first asm, this one receives null rather than %0.
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  ret void
}

; Reduced driver: fills a [7 x i32] stack slot with 42s via a single <7 x i32>
; store and passes it to fn11. A second fn11 call with null pointers is made
; only when %0 is true.
define void @fn1(i1 %0) #1 {
start:
  ; Zero-sized alloca produced by the reduction; passed to fn11 as %_4.
  %1 = alloca [0 x [0 x [0 x [5 x i32]]]], i32 0, align 4
  %2 = alloca [7 x i32], align 4
  store <7 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr %2, align 4
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 0, i128 0, ptr %1, i128 0, ptr %2)
  br i1 %0, label %bb2.preheader.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit

bb2.preheader.i:                                  ; preds = %start
  ; Second call site of fn11, with null for both pointer arguments.
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 1, i128 1, ptr null, i128 1, ptr null)
  br label %_ZN5repro3fn517h51e49bf383c47da1E.exit

_ZN5repro3fn517h51e49bf383c47da1E.exit:           ; preds = %bb2.preheader.i, %start
  ret void
}

; llvm-reduce output for repro::fn11. Loads the <7 x i32> behind %_7, copies it
; into the local %_8, calls black_box_2, runs three bcmp/store/asm sequences
; against %_4, and finally prints lanes of the loaded vector with printf using
; a format string built on the stack.
define internal fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 %_1, i128 %_3, ptr %_4, i128 %_5.1, ptr %_7) #1 personality ptr null {
start:
  ; Zero-sized alloca left behind by the reduction; only passed to inline asm.
  %0 = alloca [0 x [0 x [0 x i8]]], i32 0, align 1
  %fmt.i = alloca [4 x i8], align 1
  %1 = alloca [5 x i32], align 4
  %2 = alloca [4 x i128], align 8
  %_8 = alloca [7 x i32], align 4
  ; The vector is loaded once; %3 feeds both the store to %_8 and every
  ; extractelement used for printing below.
  %3 = load <7 x i32>, ptr %_7, align 4
  store <7 x i32> %3, ptr %_8, align 4
  tail call fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 %_3, i64 %_1)
  store i128 %_5.1, ptr %2, align 8
  %4 = load i8, ptr %_4, align 1
  store i8 %4, ptr %1, align 4
  ; Compare 28 bytes at %_4 with the local copy of the vector.
  %bcmp.i.i.i = call i32 @bcmp(ptr %_4, ptr %_8, i64 28)
  %5 = icmp eq i32 %bcmp.i.i.i, 0
  %6 = zext i1 %5 to i8
  store i8 %6, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr %0)
  ; Compare 64 bytes at %_4 with the [4 x i128] local.
  %bcmp.i.i3.i = call i32 @bcmp(ptr %_4, ptr %2, i64 64)
  %7 = icmp eq i32 %bcmp.i.i3.i, 0
  %8 = zext i1 %7 to i8
  store i8 %8, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  ; Compare 20 bytes at %_4 with the [5 x i32] local.
  %bcmp.i.i4.i = call i32 @bcmp(ptr %_4, ptr %1, i64 20)
  %9 = icmp eq i32 %bcmp.i.i4.i, 0
  %10 = zext i1 %9 to i8
  store i8 %10, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  ; Build the format string: bytes 37, 100, 32 are '%', 'd', ' '.
  ; NOTE(review): unlike the original IR, no terminating 0 byte is stored at
  ; index 3 of %fmt.i in this reduced version.
  store i8 37, ptr %fmt.i, align 1
  %11 = getelementptr [4 x i8], ptr %fmt.i, i64 0, i64 1
  store i8 100, ptr %11, align 1
  %12 = getelementptr [4 x i8], ptr %fmt.i, i64 0, i64 2
  store i8 32, ptr %12, align 1
  ; Print lanes 0..5 of the loaded vector, one printf call per lane.
  %iter.i.sroa.10.16.vec.extract = extractelement <7 x i32> %3, i64 0
  %_44.i = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.16.vec.extract)
  %iter.i.sroa.10.20.vec.extract = extractelement <7 x i32> %3, i64 1
  %_44.i.1 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.20.vec.extract)
  %iter.i.sroa.10.24.vec.extract = extractelement <7 x i32> %3, i64 2
  %_44.i.2 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.24.vec.extract)
  %iter.i.sroa.10.28.vec.extract = extractelement <7 x i32> %3, i64 3
  %_44.i.3 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.28.vec.extract)
  %iter.i.sroa.10.32.vec.extract = extractelement <7 x i32> %3, i64 4
  %_44.i.4 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.32.vec.extract)
  %iter.i.sroa.10.36.vec.extract = extractelement <7 x i32> %3, i64 5
  %_44.i.5 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.36.vec.extract)
  ; NOTE(review): the seventh printf reuses the lane-0 extract; the original IR
  ; extracts and prints lane 6 at this point.
  %_44.i.6 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.16.vec.extract)
  ret void
}

declare i32 @printf(ptr, ...)

; Entry point of the reduced reproducer: calls fn1 with its i1 argument set to
; false and returns 0.
define i32 @main() {
top:
  call void @fn1(i1 false)
  ret i32 0
}

declare i32 @bcmp(ptr, ptr, i64)

; uselistorder directives
uselistorder ptr null, { 1, 2, 6, 7, 0, 3, 4, 5 }
uselistorder ptr @_ZN5repro4fn1117ha0d291cafd330a2bE, { 1, 0 }
uselistorder ptr @printf, { 6, 5, 4, 3, 2, 1, 0 }
uselistorder ptr @bcmp, { 2, 1, 0 }

attributes #0 = { noinline }
attributes #1 = { "target-cpu"="x86-64" }

The above was from llvm-reduce. I don't know if it broke something so I attached the original IR below. This is compiled from Rust but I've patched out the symbols from Rust std so it has no dependency on Rust.

original IR
; ModuleID = 'repro.46f743e1561fb24e-cgu.0'
source_filename = "repro.46f743e1561fb24e-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%Adt56 = type { { i128, i128, { i16, i128 }, i64, i32, [1 x i32] }, %Adt55 }
%Adt55 = type { %Adt54 }
%Adt54 = type { { i128, ptr }, { i128, i128, { i16, i128 }, i64, i32, [1 x i32] } }

@vtable.0 = private unnamed_addr constant <{ ptr, [16 x i8], ptr, ptr, ptr }> <{ ptr @"_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17h0eee5ecdc5932091E", [16 x i8] c"\08\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00", ptr @"_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hf0de4a394f8e37a1E", ptr @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE", ptr @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE" }>, align 8
@alloc_a00f8a95864fc305bf508c11187211d8 = private unnamed_addr constant <{ [28 x i8] }> zeroinitializer, align 4
@alloc_4f40612ab7406a7d1f3f0640c8ea0fb4 = private unnamed_addr constant <{ [64 x i8] }> zeroinitializer, align 8
@alloc_ee0548ff1320ae5be168b83ab0b060cd = private unnamed_addr constant <{ [20 x i8] }> <{ [20 x i8] c"a\00\00\00a\00\00\00a\00\00\00a\00\00\00a\00\00\00" }>, align 4

; std::sys_common::backtrace::__rust_begin_short_backtrace
; Function Attrs: noinline nonlazybind uwtable
; Calls the function pointer %f with no arguments, then executes an empty
; side-effecting inline asm with a memory clobber before returning.
define internal fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr nocapture noundef nonnull readonly %f) unnamed_addr #0 {
start:
  tail call void %f()
  tail call void asm sideeffect "", "~{memory}"() #10, !srcloc !3
  ret void
}


; std::rt::lang_start::{{closure}}
; Function Attrs: inlinehint nonlazybind uwtable
; Loads the function pointer stored at %_1, forwards it to
; __rust_begin_short_backtrace, and returns 0.
define internal noundef i32 @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE"(ptr noalias nocapture noundef readonly align 8 dereferenceable(8) %_1) unnamed_addr #2 {
start:
  %_4 = load ptr, ptr %_1, align 8, !nonnull !4, !noundef !4
; call std::sys_common::backtrace::__rust_begin_short_backtrace
  tail call fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr noundef nonnull %_4)
  ret i32 0
}

; core::ops::function::FnOnce::call_once{{vtable.shim}}
; Function Attrs: inlinehint nonlazybind uwtable
; Same shape as the lang_start closure: loads the function pointer stored at
; %_1, passes it to __rust_begin_short_backtrace, and returns 0.
define internal noundef i32 @"_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hf0de4a394f8e37a1E"(ptr nocapture noundef readonly %_1) unnamed_addr #2 personality ptr @rust_eh_personality {
start:
  %0 = load ptr, ptr %_1, align 8, !nonnull !4, !noundef !4
; call std::sys_common::backtrace::__rust_begin_short_backtrace
  tail call fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr noundef nonnull %0), !noalias !5
  ret i32 0
}

; core::ptr::drop_in_place<std::rt::lang_start<()>::{{closure}}>
; Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable
; No-op: the body returns immediately without touching %_1.
define internal void @"_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17h0eee5ecdc5932091E"(ptr noalias nocapture readnone align 8 %_1) unnamed_addr #3 {
start:
  ret void
}

; repro::black_box_1
; Function Attrs: noinline nonlazybind uwtable
; Compares the 28 bytes at %val3 with the zero-initialized 28-byte constant
; @alloc_a00f8a95864fc305bf508c11187211d8 via bcmp, stores the equality result
; as an i8 in a one-byte stack slot, and passes that slot to an empty
; side-effecting inline asm with a memory clobber.
define internal fastcc void @_ZN5repro11black_box_117h2948a258b3403becE(ptr noalias nocapture noundef readonly dereferenceable(28) %val3) unnamed_addr #0 {
start:
  %0 = alloca i8, align 1
  %bcmp.i.i = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(28) @alloc_a00f8a95864fc305bf508c11187211d8, ptr noundef nonnull dereferenceable(28) %val3, i64 28)
  %1 = icmp eq i32 %bcmp.i.i, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0)
  %2 = zext i1 %1 to i8
  store i8 %2, ptr %0, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0)
  ret void
}

; repro::black_box_2
; Function Attrs: noinline nonlazybind uwtable
; For each of the two arguments: compares it with zero, stores the result as an
; i8 in its own one-byte stack slot, and passes that slot to an empty
; side-effecting inline asm with a memory clobber.
define internal fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 noundef %val0, i64 noundef %val1) unnamed_addr #0 {
start:
  %0 = alloca i8, align 1
  %1 = alloca i8, align 1
  ; First argument: result of (%val0 == 0) goes through slot %1.
  %_4 = icmp eq i128 %val0, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %1)
  %2 = zext i1 %_4 to i8
  store i8 %2, ptr %1, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %1) #10, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %1)
  ; Second argument: result of (%val1 == 0) goes through slot %0.
  %_7 = icmp eq i64 %val1, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0)
  %3 = zext i1 %_7 to i8
  store i8 %3, ptr %0, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0)
  ret void
}

; Function Attrs: nonlazybind uwtable
; Original (unreduced) driver. Fills %_1 with seven i32 42s, reloads it as a
; <7 x i32>, initializes a few fields of the %Adt56 local, and calls repro::fn11
; with a [5 x i32] of 97s and a copy of the vector. A loop repeats the call
; while the i16 field read back from %13 equals 2. Finally calls
; repro::black_box_1 on %_1.
define dso_local void @fn1() unnamed_addr #1 {
start:
  %0 = alloca [7 x i32], align 4
  %1 = alloca [5 x i32], align 4
  %2 = alloca [7 x i32], align 4
  %3 = alloca [5 x i32], align 4
  %_2.i = alloca %Adt56, align 8
  %_1 = alloca [7 x i32], align 4
  ; Element-wise initialization of %_1 to 42.
  store i32 42, ptr %_1, align 4
  %4 = getelementptr inbounds i32, ptr %_1, i64 1
  store i32 42, ptr %4, align 4
  %5 = getelementptr inbounds i32, ptr %_1, i64 2
  store i32 42, ptr %5, align 4
  %6 = getelementptr inbounds i32, ptr %_1, i64 3
  store i32 42, ptr %6, align 4
  %7 = getelementptr inbounds i32, ptr %_1, i64 4
  store i32 42, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %_1, i64 5
  store i32 42, ptr %8, align 4
  %9 = getelementptr inbounds i32, ptr %_1, i64 6
  store i32 42, ptr %9, align 4
  ; The whole array is read back as one non-power-of-two vector.
  %10 = load <7 x i32>, ptr %_1, align 4
  call void @llvm.lifetime.start.p0(i64 168, ptr nonnull %_2.i)
  ; Field setup inside %_2.i; %13 is the i16 field that controls the loop.
  %11 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1
  %12 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 3
  store i64 0, ptr %12, align 8, !noalias !8
  %13 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 2
  store i16 -21983, ptr %13, align 8, !noalias !8
  %14 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 2, i32 1
  store i128 0, ptr %14, align 8, !noalias !8
  store i128 0, ptr %11, align 8, !noalias !8
  %15 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 0, i32 1
  store ptr %_2.i, ptr %15, align 8, !noalias !8
  ; Fill the [5 x i32] argument buffer %3 with 97s.
  call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %3), !noalias !8
  store i32 97, ptr %3, align 4, !noalias !8
  %_3.sroa.3.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 4
  store i32 97, ptr %_3.sroa.3.0..sroa_idx.i, align 4, !noalias !8
  %_3.sroa.4.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 8
  store i32 97, ptr %_3.sroa.4.0..sroa_idx.i, align 4, !noalias !8
  %_3.sroa.5.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 12
  store i32 97, ptr %_3.sroa.5.0..sroa_idx.i, align 4, !noalias !8
  %_3.sroa.6.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 16
  store i32 97, ptr %_3.sroa.6.0..sroa_idx.i, align 4, !noalias !8
  ; Copy the vector into a fresh slot and pass that slot by pointer.
  call void @llvm.lifetime.start.p0(i64 28, ptr nonnull %2), !noalias !8
  store <7 x i32> %10, ptr %2, align 4, !noalias !8
; call repro::fn11
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef 0, i128 noundef 0, ptr noalias nocapture noundef nonnull readonly dereferenceable(20) %3, i128 noundef 0, ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %2)
  call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %3), !noalias !8
  call void @llvm.lifetime.end.p0(i64 28, ptr nonnull %2), !noalias !8
  ; Enter the loop only if the i16 field equals 2 (it was stored as -21983 above).
  %16 = load i16, ptr %13, align 8, !noalias !8, !noundef !4
  %17 = icmp eq i16 %16, 2
  br i1 %17, label %bb2.preheader.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit

bb2.preheader.i:                                  ; preds = %start
  ; Loop-invariant element addresses of the second [5 x i32] buffer %1.
  %_3.sroa.3.0..sroa_idx3.i = getelementptr inbounds i8, ptr %1, i64 4
  %_3.sroa.4.0..sroa_idx5.i = getelementptr inbounds i8, ptr %1, i64 8
  %_3.sroa.5.0..sroa_idx7.i = getelementptr inbounds i8, ptr %1, i64 12
  %_3.sroa.6.0..sroa_idx9.i = getelementptr inbounds i8, ptr %1, i64 16
  br label %bb2.i

bb2.i:                                            ; preds = %bb2.i, %bb2.preheader.i
  ; Loop body: same call as above, but with arguments read back from %_2.i.
  %18 = load i64, ptr %12, align 8, !noalias !8, !noundef !4
  %19 = load i128, ptr %11, align 8, !noalias !8, !noundef !4
  call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %1), !noalias !8
  store i32 97, ptr %1, align 4, !noalias !8
  store i32 97, ptr %_3.sroa.3.0..sroa_idx3.i, align 4, !noalias !8
  store i32 97, ptr %_3.sroa.4.0..sroa_idx5.i, align 4, !noalias !8
  store i32 97, ptr %_3.sroa.5.0..sroa_idx7.i, align 4, !noalias !8
  store i32 97, ptr %_3.sroa.6.0..sroa_idx9.i, align 4, !noalias !8
  %20 = load i128, ptr %14, align 8, !noalias !8, !noundef !4
  call void @llvm.lifetime.start.p0(i64 28, ptr nonnull %0), !noalias !8
  store <7 x i32> %10, ptr %0, align 4, !noalias !8
; call repro::fn11
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef %18, i128 noundef %19, ptr noalias nocapture noundef nonnull readonly dereferenceable(20) %1, i128 noundef %20, ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %0)
  call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %1), !noalias !8
  call void @llvm.lifetime.end.p0(i64 28, ptr nonnull %0), !noalias !8
  %21 = load i16, ptr %13, align 8, !noalias !8, !noundef !4
  %22 = icmp eq i16 %21, 2
  br i1 %22, label %bb2.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit

_ZN5repro3fn517h51e49bf383c47da1E.exit:           ; preds = %bb2.i, %start
  call void @llvm.lifetime.end.p0(i64 168, ptr nonnull %_2.i)
; call repro::black_box_1
  call fastcc void @_ZN5repro11black_box_117h2948a258b3403becE(ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %_1)
  ret void
}

; repro::fn11
; Function Attrs: nonlazybind uwtable
; Original (unreduced) fn11. Loads the <7 x i32> behind %_7 and copies it to the
; local %_8, calls black_box_2, builds a [4 x i128] and a [5 x i32] local,
; compares the three locals against constant globals with bcmp (each result is
; stored to a one-byte slot handed to an empty inline asm), then prints the
; seven vector lanes with "%d " followed by a newline.
define internal fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef %_1, i128 noundef %_3, ptr noalias nocapture noundef readonly dereferenceable(20) %_4, i128 noundef %_5.1, ptr noalias nocapture noundef readonly dereferenceable(28) %_7) unnamed_addr #1 personality ptr @rust_eh_personality {
start:
  %0 = alloca i8, align 1
  %1 = alloca i8, align 1
  %2 = alloca i8, align 1
  %3 = alloca i8, align 1
  %4 = alloca i8, align 1
  %5 = alloca i8, align 1
  %lf.i = alloca [2 x i8], align 1
  %fmt.i = alloca [4 x i8], align 1
  %6 = alloca [5 x i32], align 4
  %7 = alloca [4 x i128], align 8
  %_8 = alloca [7 x i32], align 4
  ; Single vector load; %8 feeds both the store to %_8 and the lane extracts
  ; used for printing.
  %8 = load <7 x i32>, ptr %_7, align 4
  store <7 x i32> %8, ptr %_8, align 4
; call repro::black_box_2
  tail call fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 noundef %_3, i64 noundef %_1)
  ; Build %7: first 32 bytes zero, then %_5.1 at byte 32, then zero at byte 48.
  call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %7)
  %_12.sroa.3.0..sroa_idx = getelementptr inbounds i8, ptr %7, i64 32
  call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %7, i8 0, i64 32, i1 false)
  store i128 %_5.1, ptr %_12.sroa.3.0..sroa_idx, align 8
  %_12.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %7, i64 48
  store i128 0, ptr %_12.sroa.4.0..sroa_idx, align 8
  ; Copy the 20 bytes behind %_4 into the local %6.
  call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %6)
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(20) %6, ptr noundef nonnull align 4 dereferenceable(20) %_4, i64 20, i1 false)
  ; Constant 0 passed through a one-byte slot to the inline asm.
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %5), !noalias !11
  store i8 0, ptr %5, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %5) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %5), !noalias !11
  ; Is the local vector copy %_8 equal to the all-zero 28-byte constant?
  %bcmp.i.i.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(28) @alloc_a00f8a95864fc305bf508c11187211d8, ptr noundef nonnull dereferenceable(28) %_8, i64 28), !noalias !16
  %9 = icmp eq i32 %bcmp.i.i.i, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %4), !noalias !11
  %10 = zext i1 %9 to i8
  store i8 %10, ptr %4, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %4) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %4), !noalias !11
  ; Two constant-1 values passed through one-byte slots to the inline asm.
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %3), !noalias !11
  store i8 1, ptr %3, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %3) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %3), !noalias !11
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %2), !noalias !11
  store i8 1, ptr %2, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %2) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %2), !noalias !11
  ; Is %7 equal to the all-zero 64-byte constant?
  %bcmp.i.i3.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(64) @alloc_4f40612ab7406a7d1f3f0640c8ea0fb4, ptr noundef nonnull dereferenceable(64) %7, i64 64), !noalias !17
  %11 = icmp eq i32 %bcmp.i.i3.i, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %1), !noalias !11
  %12 = zext i1 %11 to i8
  store i8 %12, ptr %1, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %1) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %1), !noalias !11
  ; Is %6 equal to the 20-byte constant @alloc_ee0548ff1320ae5be168b83ab0b060cd?
  %bcmp.i.i4.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(20) @alloc_ee0548ff1320ae5be168b83ab0b060cd, ptr noundef nonnull dereferenceable(20) %6, i64 20), !noalias !18
  %13 = icmp eq i32 %bcmp.i.i4.i, 0
  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0), !noalias !11
  %14 = zext i1 %13 to i8
  store i8 %14, ptr %0, align 1, !noalias !11
  call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !noalias !11, !srcloc !3
  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0), !noalias !11
  ; Build the NUL-terminated format string "%d " (bytes 37, 100, 32, 0).
  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %fmt.i), !noalias !11
  store i8 37, ptr %fmt.i, align 1, !noalias !11
  %15 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 1
  store i8 100, ptr %15, align 1, !noalias !11
  %16 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 2
  store i8 32, ptr %16, align 1, !noalias !11
  %17 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 3
  store i8 0, ptr %17, align 1, !noalias !11
  ; Print lanes 0..6 of the loaded vector, one printf call per lane.
  %iter.i.sroa.10.16.vec.extract = extractelement <7 x i32> %8, i64 0
  %_44.i = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.16.vec.extract), !noalias !11
  %iter.i.sroa.10.20.vec.extract = extractelement <7 x i32> %8, i64 1
  %_44.i.1 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.20.vec.extract), !noalias !11
  %iter.i.sroa.10.24.vec.extract = extractelement <7 x i32> %8, i64 2
  %_44.i.2 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.24.vec.extract), !noalias !11
  %iter.i.sroa.10.28.vec.extract = extractelement <7 x i32> %8, i64 3
  %_44.i.3 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.28.vec.extract), !noalias !11
  %iter.i.sroa.10.32.vec.extract = extractelement <7 x i32> %8, i64 4
  %_44.i.4 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.32.vec.extract), !noalias !11
  %iter.i.sroa.10.36.vec.extract = extractelement <7 x i32> %8, i64 5
  %_44.i.5 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.36.vec.extract), !noalias !11
  %iter.i.sroa.10.40.vec.extract = extractelement <7 x i32> %8, i64 6
  %_44.i.6 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.40.vec.extract), !noalias !11
  ; Print a trailing newline: %lf.i holds bytes 10, 0.
  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %lf.i), !noalias !11
  store i8 10, ptr %lf.i, align 1, !noalias !11
  %18 = getelementptr inbounds [2 x i8], ptr %lf.i, i64 0, i64 1
  store i8 0, ptr %18, align 1, !noalias !11
  %_50.i = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %lf.i), !noalias !11
  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %lf.i), !noalias !11
  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %fmt.i), !noalias !11
  call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %7)
  call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %6)
  ret void
}

; repro::main
; Function Attrs: nonlazybind uwtable
; Thin wrapper that tail-calls fn1 and returns.
define internal void @_ZN5repro4main17hc1028cd349e9622cE() unnamed_addr #1 {
start:
  tail call void @fn1()
  ret void
}

; Function Attrs: nonlazybind uwtable
; Stub personality routine: ignores all five arguments and returns 0.
; The report says the Rust std symbols were patched out of this IR.
define internal i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1 {
start:
  ret i32 0
}

; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4

; Function Attrs: nofree nounwind nonlazybind uwtable
declare noundef i32 @printf(ptr nocapture noundef readonly, ...) unnamed_addr #5

; Function Attrs: nonlazybind
; Program entry point: calls fn1 directly, ignoring both parameters, and
; returns 0.
define i32 @main(i32 %0, ptr %1) unnamed_addr #6 {
top:
  call void @fn1()
  ret i32 0
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #7

; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #7

; Function Attrs: nofree nounwind nonlazybind willreturn memory(argmem: read)
declare i32 @bcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #8

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #9

attributes #0 = { noinline nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #2 = { inlinehint nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #3 = { inlinehint mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #4 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #5 = { nofree nounwind nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #6 = { nonlazybind "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #8 = { nofree nounwind nonlazybind willreturn memory(argmem: read) }
attributes #9 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #10 = { nounwind }

!llvm.module.flags = !{!0, !1, !2}

!0 = !{i32 8, !"PIC Level", i32 2}
!1 = !{i32 7, !"PIE Level", i32 2}
!2 = !{i32 2, !"RtLibUseGOT", i32 1}
!3 = !{i32 704612}
!4 = !{}
!5 = !{!6}
!6 = distinct !{!6, !7, !"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE: %_1"}
!7 = distinct !{!7, !"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE"}
!8 = !{!9}
!9 = distinct !{!9, !10, !"_ZN5repro3fn517h51e49bf383c47da1E: %_1"}
!10 = distinct !{!10, !"_ZN5repro3fn517h51e49bf383c47da1E"}
!11 = !{!12, !14, !15}
!12 = distinct !{!12, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val0"}
!13 = distinct !{!13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E"}
!14 = distinct !{!14, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val2"}
!15 = distinct !{!15, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val3"}
!16 = !{!14, !15}
!17 = !{!12, !15}
!18 = !{!12, !14}
@efriedma-quic
Copy link
Collaborator

I don't think your reduction is correct; it looks like it involves accessing zero-byte allocations.

Generally, the first tool I reach for to reduce miscompiles is opt-bisect-limit (https://llvm.org/docs/OptBisect.html).

@cbeuw
Copy link
Author

cbeuw commented Jun 24, 2023

I removed all the zero-byte allocas: https://godbolt.org/z/jEbPc1P94

@nikic
Copy link
Contributor

nikic commented Jun 27, 2023

Looks like there is an ABI mismatch. The arguments are pushed via pushq at 8 byte offsets and then read via movl at 4 byte offsets.

@nikic
Copy link
Contributor

nikic commented Jun 27, 2023

Here's a reduction:

; Minimal caller: passes six null pointers followed by a constant <7 x i32> of
; 42s to @callee. The assembly quoted below shows the seven lanes being pushed
; with pushq.
define void @caller() nounwind {
  call void @callee(ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, <7 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>)
  ret void
}

; Minimal callee: takes six pointer arguments followed by a <7 x i32>, stores
; the vector to a stack slot, and passes each of its seven lanes (indices 0..6)
; to @use.
; Per the discussion, the store to %alloca is what triggers the wrong load
; offsets; dropping it yields correct code.
; NOTE(review): the six leading pointers presumably use up the integer argument
; registers so that the vector lanes are passed on the stack — confirm.
define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i32> %arg) nounwind {
start:
  %alloca = alloca [7 x i32], align 4
  store <7 x i32> %arg, ptr %alloca, align 4
  %extract0 = extractelement <7 x i32> %arg, i64 0
  call void @use(i32 %extract0)
  %extract1 = extractelement <7 x i32> %arg, i64 1
  call void @use(i32 %extract1)
  %extract2 = extractelement <7 x i32> %arg, i64 2
  call void @use(i32 %extract2)
  %extract3 = extractelement <7 x i32> %arg, i64 3
  call void @use(i32 %extract3)
  %extract4 = extractelement <7 x i32> %arg, i64 4
  call void @use(i32 %extract4)
  %extract5 = extractelement <7 x i32> %arg, i64 5
  call void @use(i32 %extract5)
  ; Lane 6 is the last valid lane of a 7-element vector; an index of 7 would
  ; be out of range and produce poison.
  %extract6 = extractelement <7 x i32> %arg, i64 6
  call void @use(i32 %extract6)
  ret void
}

declare void @use(i32)

The caller does:

	pushq	$42
	pushq	$42
	pushq	$42
	pushq	$42
	pushq	$42
	pushq	$42
	pushq	$42
	callq	callee@PLT

The callee does:

	movl	112(%rsp), %ebx
	movl	104(%rsp), %ebp
	movl	96(%rsp), %r14d
	movl	76(%rsp), %r15d
	movl	72(%rsp), %r12d
	movl	64(%rsp), %edi
	movl	68(%rsp), %r13d

If we drop the store, then the offsets are correct (don't mind the different base):

	movl	144(%rsp), %ebx
	movl	136(%rsp), %ebp
	movl	128(%rsp), %r14d
	movl	120(%rsp), %r15d
	movl	112(%rsp), %r12d
	movl	104(%rsp), %r13d
	movl	96(%rsp), %edi

So this is again in some way related to the arg copy elision optimization.

@nikic nikic self-assigned this Jun 27, 2023
@nikic
Copy link
Contributor

nikic commented Jun 29, 2023

This seems to be related to the code in X86ISelLowering::LowerMemArgument() handling isCopyElisionCandidate(). It checks for ScalarizedAndExtendedVector, but does so by inspecting the size of the LocVT. However, if I'm understanding this right, in this case the LocVT is i32, matching the vector element size, but this doesn't match the size of the stack slot, which is 8 bytes.

I'm not sure if there's any easy way to access that stack slot size though... CCValAssign only stores the start offset.

@nikic
Copy link
Contributor

nikic commented Jun 29, 2023

Candidate patch: https://reviews.llvm.org/D154078

@llvmbot
Copy link
Member

llvmbot commented Jun 29, 2023

@llvm/issue-subscribers-backend-x86

@nikic
Copy link
Contributor

nikic commented Jul 3, 2023

@cbeuw Do you have the original Rust code that led to this issue? I find it suspicious that we end up with illegal vector types in optimized IR -- unless you did something with repr(simd) I don't think that's supposed to happen.

@cbeuw
Copy link
Author

cbeuw commented Jul 3, 2023

@nikic I have the unreduced code in custom MIR: https://godbolt.org/z/7q6q8eK96. But I don't have the reduced one around any more... I'm happy to run the minimisation script again though if needed.

This isn't reproducible from surface Rust, which is why I opened a bug report with LLVM directly. The reproduction required a Move operand of an array local in a function call, where the same local was previously used. This MIR cannot be built from surface Rust as MIR building creates temporary copies for all Move operands in Call. The local that gets moved is assigned to and used exactly once. If you change Move(_16) to _16 on line 3200 then the bug goes away.

@cbeuw
Copy link
Author

cbeuw commented Jul 3, 2023

By illegal vector types do you mean the zero-byte [0 x [0 x [0 x i8]]]s? They weren't from rustc, they were from llvm-reduce. The IR from Rust was folded under original IR in the OP.

@nikic
Copy link
Contributor

nikic commented Jul 3, 2023

"Illegal vector type" here refers to the non-power-of-two vectors, which are not natively supported by the target. They are already part of the input IR, and the most likely culprit for that is rust-lang/rust#111999.

I wonder whether it would make sense to prevent argument promotion for such types, as the legalized argument passing for such vectors can be substantially worse than just passing them indirectly (and it makes it more likely to hit legalization bugs like #63608).

@nikic nikic closed this as completed in 7025ac8 Jul 13, 2023
veselypeta pushed a commit to veselypeta/cherillvm that referenced this issue Sep 6, 2024
When eliding argument copies, the memory layout between a plain
store of the type and the layout of the argument lowering on the
stack must match. For multi-part argument lowerings, this is not
necessarily the case.

The code already tried to prevent this optimization for "scalarized
and extended" vectors, but the check for "extends" was incomplete.
While a scalarized vector of i32s stores i32 values on the stack,
these are stored in 8 byte stack slots (on x86_64), so effectively
have padding.

Rather than trying to add more special cases to handle this (which
is not straightforward), I'm going in the other direction and
exclude scalarized vectors from this optimization entirely. This
seems like a rare case that is not worth the hassle -- the complete
lack of test coverage is not reassuring either.

Fixes llvm/llvm-project#63475.

Differential Revision: https://reviews.llvm.org/D154078
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

6 participants